为啥会出现索引超出范围,是哪个地方出问题了

代码向导 其他问答 1
import requests
import re
import pymysql
import time


def baidu(company):
  ur1 = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=' + company
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'}
  res = requests.get(ur1, headers=headers, timeout=10).text
  p_title = 'data-click="{.*?}"><!--s-text-->(.*?)<!--/s-text--></a>'
  p_href = '<h3 ><a href="(.*?)" target="_blank" '
  p_date = '<span aria-label="发布于:.*?">(.*?)</span>'
  p_info = '<span aria-label="新闻来源:.*?">(.*?)</span>'
  href = re.findall(p_href, res, re.S)
  title = re.findall(p_title, res, re.S)
  date = re.findall(p_date, res, re.S)
  info = re.findall(p_info, res, re.S)


  for i in range(len(title)):
    title[i] = title[i].strip()
    title[i] = re.sub('<.*?>', '', title[i])
    title[i] = re.sub('[...]', '', title[i])
    title[i] = re.sub('&.*?', '', title[i])
    title[i] = re.sub('#', '', title[i])
    title[i] = re.sub('#', '', title[i])
    href[i] = href[i].strip()
    date[i] = date[i].strip()
    date[i] = re.sub('月', '-', date[i])
    date[i] = re.sub('日', '', date[i])
    if ('小时' in date[i]) or ('分钟' in date[i]):
      date[i] = time.strftime("%Y-%m-%d")
    else:
      date[i] = date[i]
    # print(str(i+1)+'.'+title[i])
    # print(href[i])
    # print(company+'该条舆情评分为'+str(score[i]))

  #舆情0数据评分4.0
  score = []
  keywords = ['违约', '诉讼', '兑付', '投诉']
  for i in range(len(title)):
    num = 0
    try:
      article = requests.get(href[i],headers = headers,timeout =10).text
    except:
      article = '单个新闻爬取失败'

    try:
      article = article.encode('ISO-8859-1').decode('utf-8')
    except:
      try:
        article = article.encode('ISO-8859-1').decode('gbk')
      except:
        article = article

    p_article = '<p>(.*?)</p>'
    article_main = re.findall(p_article,article)
    article = ''.join(article_main)

    for k in keywords:
      if (k in article) or (k in title[i]):
        num -= 5
      if ('违约' in article):
        num -= 10
    score.append(num)

    company_re = company[0]+'.{0,5}'+company[-1]
    if len(re.findall(company_re,company))<1:
      title[i]=''
      href[i]=''
      date[i]=''
      info[i]=''
      score[i]=''
    while''in title:
      title.remove('')
    while''in href:
      href.remove('')
    while''in date:
      date.remove('')
    while''in info:
      info.remove('')
    while''in score:
      score.remove('')

    for i in range(len(title)):
      print(str(i+1)+'.'+title[i]+'('+date[i]+' '+info[i]+')')
      print(href[i])
      print(company+'该条新闻的舆情评分为'+ str(score[i]))

#入数据库
  for i in range(len(date)):
    db = pymysql.connect(host='localhost', port=3306, user='root', password='', database='pachongnew',charset='utf8')
    cur = db.cursor()
    sql_1 = 'SELECT * FROM article WHERE company = %s'
    cur.execute(sql_1, company)
    data_all = cur.fetchall()
    title_all = []
    for j in range(len(data_all)):
      title_all.append(data_all[j][1])
    if title[i] not in title_all:
      sql_2 = 'INSERT INTO article(company,title,href,date,info,score) VALUES (%s,%s,%s,%s,%s,%s)'
      cur.execute(sql_2,(company,title[i],href[i],date[i],info[i],score[i]))
      db.commit()
    cur.close()
    db.close()
  print('-------------------------------------------------------')

baidu('腾讯')

  #各种公司
# companys = ['阿里巴巴', '京东', '华能信托', '腾讯','百度集团']
# for i in companys:
#   try:
#     baidu(i)
#     print(i + '百度新闻爬取成功')
#   except:
#     print(i + '百度新闻爬取失败')

回复

共1条回复 我来回复
  • 代码工厂
    这个人很懒,什么都没有留下~
    评论
    title = re.findall(p_title, res, re.S)
        date = re.findall(p_date, res, re.S)
    
            title[i] = re.sub('#', '', title[i])
            href[i] = href[i].strip()
            date[i] = date[i].strip()
    
    0条评论

发表回复

登录后才能评论