python爬取论坛文章问题

毕设港湾 其他问答 1

需求是：把页面上的内容保存到 Word 中，页面上有文字和图片。现在只实现了文字和图片分开保存（文字在前，图片在后），希望能按照页面上的顺序，让文字和图片穿插保存。我想在读取到内容后先判断它是文字还是图片，然后分别执行不同的代码进行保存，但尝试了很久也没找到方法。希望能实现按照页面上的顺序保存到本地 Word 文档中。

import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches,Pt,RGBColor
from docx.oxml.ns import qn

# Scrape one forum post and save it to a .docx file.
# As written, every text paragraph is emitted first and every image after
# it, so the interleaving seen on the page is not preserved.

# Browser-like User-Agent sent with every request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

# Output document: the Normal style is set to 12 pt black '宋体'.
file = Document()
file.styles['Normal'].font.name = u'宋体'
# Also set the East Asian font on the style's rFonts element.
file.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
file.styles['Normal'].font.size = Pt(12)
file.styles['Normal'].font.color.rgb = RGBColor(0,0,0)
url = 'https://bbs.tiexue.net/post_7023745_1.html'
title = '标题'

# timeout is a (connect, read) pair in seconds.
strhtml = requests.get(url, headers=headers, timeout=(4, 3))
# Stop on an HTTP error instead of parsing an error page as the post.
strhtml.raise_for_status()
soup = BeautifulSoup(strhtml.text, 'html.parser')
# NOTE(review): the selectors previously read 'p[]', which is not valid CSS
# (an attribute selector needs a name). Plain 'p' is assumed here -- confirm
# against the page markup.
data = soup.select('#postContent > p')
pic = soup.select('#postContent > p > a > img')
file.add_paragraph(url)
file.add_paragraph(title)

# Pass 1: the text of every paragraph.
for item1 in data:
    file.add_paragraph(item1.get_text())

# Pass 2: every image, appended after all of the text.
for item2 in pic:
    src = item2.get('src')
    if not src:
        # An <img> without a src has nothing to download.
        continue
    # Use a separate name for the response so the list being iterated
    # (`pic`) is not rebound inside its own loop.
    img_resp = requests.get(src, headers=headers, timeout=(4, 3))
    img_resp.raise_for_status()
    # The image is staged in a scratch file that is overwritten each time.
    with open('pic_tmp.png', "wb") as f:
        f.write(img_resp.content)
    file.add_picture('pic_tmp.png', width=Inches(6))

docxurl = title + '.docx'
file.save(docxurl)

回复

共1条回复 我来回复
  • 源码客栈网
    这个人很懒,什么都没有留下~
    评论
    from io import BytesIO
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup, Comment, Tag
    from docx import Document
    from docx.shared import Inches,Pt,RGBColor
    from docx.oxml.ns import qn

    # Scrape one forum post and save it to a .docx file, keeping text and
    # images in the order in which they appear on the page.

    # Browser-like User-Agent sent with every request.
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

    # Output document: the Normal style is set to 12 pt black '宋体'.
    file = Document()
    file.styles['Normal'].font.name = u'宋体'
    # Also set the East Asian font on the style's rFonts element.
    file.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    file.styles['Normal'].font.size = Pt(12)
    file.styles['Normal'].font.color.rgb = RGBColor(0,0,0)
    url = 'https://bbs.tiexue.net/post_7023745_1.html'
    title = '标题'

    # timeout is a (connect, read) pair in seconds.
    strhtml = requests.get(url, headers=headers, timeout=(4, 3))
    # Stop on an HTTP error instead of parsing an error page as the post.
    strhtml.raise_for_status()
    soup = BeautifulSoup(strhtml.text, 'html.parser')
    # NOTE(review): the selector previously read 'p[]', which is not valid CSS
    # (an attribute selector needs a name). Plain 'p' is assumed here --
    # confirm against the page markup.
    data = soup.select('#postContent > p')
    file.add_paragraph(url)
    file.add_paragraph(title)

    for item1 in data:
        # Walk the paragraph in document order so that text before, between
        # and after images lands in the right place, and so that every image
        # in the paragraph is kept (not just the first one).
        pending_text = []
        has_image = False
        for node in item1.descendants:
            if isinstance(node, Tag):
                if node.name != 'img':
                    continue
                src = node.get('src')
                if not src:
                    # An <img> without a src has nothing to download.
                    continue
                # Image: first write out any text collected ahead of it.
                if ''.join(pending_text).strip():
                    file.add_paragraph(''.join(pending_text))
                pending_text = []
                # Resolve relative image addresses against the page URL.
                img_resp = requests.get(urljoin(url, src), headers=headers, timeout=(4, 3))
                img_resp.raise_for_status()
                # Hand the bytes over as an in-memory stream; no scratch
                # file on disk is needed.
                file.add_picture(BytesIO(img_resp.content), width=Inches(6))
                has_image = True
            elif not isinstance(node, Comment):
                # Text node (HTML comments are not page text).
                pending_text.append(str(node))
        # Text: whatever follows the last image. A paragraph with no image
        # is always written, even when empty, so blank lines are preserved.
        remaining = ''.join(pending_text)
        if remaining.strip() or not has_image:
            file.add_paragraph(remaining)

    docxurl = title + '.docx'
    file.save(docxurl)
    
    0条评论

发表回复

登录后才能评论