为啥会出现索引超出范围,是哪个地方出问题了
其他问答
1
import requests import re import pymysql import time def baidu(company): ur1 = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=' + company headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'} res = requests.get(ur1, headers=headers, timeout=10).text p_title = 'data-click="{.*?}"><!--s-text-->(.*?)<!--/s-text--></a>' p_href = '<h3 ><a href="(.*?)" target="_blank" ' p_date = '<span aria-label="发布于:.*?">(.*?)</span>' p_info = '<span aria-label="新闻来源:.*?">(.*?)</span>' href = re.findall(p_href, res, re.S) title = re.findall(p_title, res, re.S) date = re.findall(p_date, res, re.S) info = re.findall(p_info, res, re.S) for i in range(len(title)): title[i] = title[i].strip() title[i] = re.sub('<.*?>', '', title[i]) title[i] = re.sub('[...]', '', title[i]) title[i] = re.sub('&.*?', '', title[i]) title[i] = re.sub('#', '', title[i]) title[i] = re.sub('#', '', title[i]) href[i] = href[i].strip() date[i] = date[i].strip() date[i] = re.sub('月', '-', date[i]) date[i] = re.sub('日', '', date[i]) if ('小时' in date[i]) or ('分钟' in date[i]): date[i] = time.strftime("%Y-%m-%d") else: date[i] = date[i] # print(str(i+1)+'.'+title[i]) # print(href[i]) # print(company+'该条舆情评分为'+str(score[i])) #舆情0数据评分4.0 score = [] keywords = ['违约', '诉讼', '兑付', '投诉'] for i in range(len(title)): num = 0 try: article = requests.get(href[i],headers = headers,timeout =10).text except: article = '单个新闻爬取失败' try: article = article.encode('ISO-8859-1').decode('utf-8') except: try: article = article.encode('ISO-8859-1').decode('gbk') except: article = article p_article = '<p>(.*?)</p>' article_main = re.findall(p_article,article) article = ''.join(article_main) for k in keywords: if (k in article) or (k in title[i]): num -= 5 if ('违约' in article): num -= 10 score.append(num) company_re = company[0]+'.{0,5}'+company[-1] if len(re.findall(company_re,company))<1: title[i]='' href[i]='' date[i]='' info[i]='' score[i]='' while''in title: title.remove('') while''in href: href.remove('') while''in date: date.remove('') while''in info: info.remove('') while''in score: score.remove('') for i in range(len(title)): print(str(i+1)+'.'+title[i]+'('+date[i]+' '+info[i]+')') print(href[i]) print(company+'该条新闻的舆情评分为'+ str(score[i])) #入数据库 for i in range(len(date)): db = pymysql.connect(host='localhost', port=3306, user='root', password='', database='pachongnew',charset='utf8') cur = db.cursor() sql_1 = 'SELECT * FROM article WHERE company = %s' cur.execute(sql_1, company) data_all = cur.fetchall() title_all = [] for j in range(len(data_all)): title_all.append(data_all[j][1]) if title[i] not in title_all: sql_2 = 'INSERT INTO article(company,title,href,date,info,score) VALUES (%s,%s,%s,%s,%s,%s)' cur.execute(sql_2,(company,title[i],href[i],date[i],info[i],score[i])) db.commit() cur.close() db.close() print('-------------------------------------------------------') baidu('腾讯') #各种公司 # companys = ['阿里巴巴', '京东', '华能信托', '腾讯','百度集团'] # for i in companys: # try: # baidu(i) # print(i + '百度新闻爬取成功') # except: # print(i + '百度新闻爬取失败')
发表回复