Problem scraping a forum article with Python
Requirement: save the content of a page, which contains both text and images, into a Word document. Right now I can only save the text and the images separately, with all the text first and all the images after it. I would like them saved interleaved, in the same order as they appear on the page. My idea was to check, for each piece of content read, whether it is text or an image and then run different saving code for each case, but after a lot of trying I have not found a way to do it. How can I save the content to a local Word file in page order?
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.oxml.ns import qn

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

# set up the Word document with a 12 pt SimSun (宋体) default style
file = Document()
file.styles['Normal'].font.name = u'宋体'
file.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
file.styles['Normal'].font.size = Pt(12)
file.styles['Normal'].font.color.rgb = RGBColor(0, 0, 0)

url = 'https://bbs.tiexue.net/post_7023745_1.html'
title = '标题'

strhtml = requests.get(url, headers=headers, timeout=(4, 3))
soup = BeautifulSoup(strhtml.text, 'html.parser')
data = soup.select('#postContent > p[]')
pic = soup.select('#postContent > p[] > a > img')

file.add_paragraph(url)
file.add_paragraph(title)

# first pass: all text paragraphs
for item1 in data:
    result1 = {
        'paragraph': item1.get_text()
    }
    file.add_paragraph(result1['paragraph'])

# second pass: all images
for item2 in pic:
    result2 = {
        'pic': item2.get('src')
    }
    pic = requests.get(result2['pic'], headers=headers, timeout=(4, 3))
    with open('pic_tmp.png', "wb") as f:
        f.write(pic.content)
    file.add_picture('pic_tmp.png', width=Inches(6))

docxurl = title + '.docx'
file.save(docxurl)
Iterate over the paragraphs in page order and check each one for an <img> tag: if it has one, download and insert the image, otherwise add the paragraph text. The reworked single loop is the only change from your code:
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.oxml.ns import qn

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

file = Document()
file.styles['Normal'].font.name = u'宋体'
file.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
file.styles['Normal'].font.size = Pt(12)
file.styles['Normal'].font.color.rgb = RGBColor(0, 0, 0)

url = 'https://bbs.tiexue.net/post_7023745_1.html'
title = '标题'

strhtml = requests.get(url, headers=headers, timeout=(4, 3))
soup = BeautifulSoup(strhtml.text, 'html.parser')
data = soup.select('#postContent > p[]')
# pic = soup.select('#postContent > p[] > a > img')  # no longer needed

file.add_paragraph(url)
file.add_paragraph(title)

# single pass over the paragraphs in page order
for item1 in data:
    if str(item1).find('<img') != -1:  # this paragraph contains an image
        item2 = item1.select('img')[0]
        result2 = {
            'pic': item2.get('src')
        }
        pic = requests.get(result2['pic'], headers=headers, timeout=(4, 3))
        with open('pic_tmp.png', "wb") as f:
            f.write(pic.content)
        file.add_picture('pic_tmp.png', width=Inches(6))
    else:  # plain text paragraph
        result1 = {
            'paragraph': item1.get_text()
        }
        file.add_paragraph(result1['paragraph'])

docxurl = title + '.docx'
file.save(docxurl)
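One limitation of the if/else above: when a single <p> holds both visible text and an <img>, only the image is saved and that paragraph's text is dropped. Below is a minimal sketch of a finer-grained variant; it assumes the data, file and headers variables from the script above and walks each paragraph's child nodes, so text and images stay interleaved even inside one paragraph.

import requests
from bs4.element import Tag
from docx.shared import Inches

for p in data:
    for node in p.descendants:  # child nodes in page order
        if isinstance(node, Tag) and node.name == 'img':
            src = node.get('src')
            if not src:
                continue
            img = requests.get(src, headers=headers, timeout=(4, 3))
            with open('pic_tmp.png', 'wb') as f:
                f.write(img.content)
            file.add_picture('pic_tmp.png', width=Inches(6))
        elif isinstance(node, str) and node.strip():  # NavigableString subclasses str
            file.add_paragraph(node.strip())

Each text node becomes its own Word paragraph in this sketch; if you want the fragments of one <p> to stay together, collect them in a list and call file.add_paragraph once per <p>.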