Scraping Hupu News with Python

This is a small project I was asked to build at the company where I interned in Beijing. Here is a brief walkthrough of the main ideas, along with the core code.

A Brief Walkthrough

Let's take scraping the main fields of each news item (content, source, and so on) as the example.

The title, timestamp, and source can all be grabbed straight from the static list page and cleaned up with the usual Python libraries. The list page also links to each article's content page, which holds the body text and images; those can be extracted directly as well.
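As a minimal sketch of that list-page step (it reuses the selectors from the full script below and skips the browser headers the script adds, so treat it as illustrative rather than guaranteed against the current page markup):

```python
import requests
from bs4 import BeautifulSoup

# Fetch the static NBA news list page.
html = requests.get('https://voice.hupu.com/nba').text
soup = BeautifulSoup(html, 'lxml')

# Each <li> under the news list is one article: title plus content-page link.
for li in soup.find('div', class_='news-list').find_all('li'):
    title = li.find('div', class_='list-hd').get_text().strip()
    link = li.find_all('a')[0].get('href')
    print(title, link)
```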

To get an article's view count, however, we go into its corresponding forum thread and scrape it there. From the thread page we can see:

The view count updates in real time, so it has to be fetched separately.

The request behind the live-updating count is: https://msa.hupu.com/thread_hit?tid=35418557&_=1589521335428

Trimming the URL down gives: https://msa.hupu.com/thread_hit?tid=35418557

Here tid is simply the ID in each post's URL.
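A minimal sketch of querying that endpoint, assuming (as the full script below does) that the count comes back in the response body:

```python
import requests

def get_view_count(thread_url):
    # tid is the numeric part of the thread URL,
    # e.g. https://bbs.hupu.com/35418557.html -> 35418557
    tid = thread_url.split('/')[-1].split('.')[0]
    resp = requests.get('https://msa.hupu.com/thread_hit', params={'tid': tid})
    # Assumption: the endpoint returns the live hit count in the body.
    return resp.text

print(get_view_count('https://bbs.hupu.com/35418557.html'))
```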

The Code

Here is the main parsing code, built on requests and BeautifulSoup:

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
@time: 2019.9.10
@author: junliu
'''
import re
import time
import datetime

import requests
from bs4 import BeautifulSoup

# Hupu CBA news:     https://voice.hupu.com/cba
# Hupu NBA news:     https://voice.hupu.com/nba
# Hupu soccer news:  https://voice.hupu.com/soccer
# Hupu CSL news:     https://voice.hupu.com/china
# Pagination sample: https://voice.hupu.com/china/page/


def get_new_info():
    """Parse the news list page: title, link, timestamp, and source."""
    nba_url = 'https://voice.hupu.com/nba'
    html = get_html_res(nba_url)
    soup = BeautifulSoup(html, 'lxml')
    new_list = soup.find('div', class_='news-list')
    for i in new_list.find_all('li'):
        # Title of the news entry.
        print(i.find('div', class_='list-hd').get_text().strip())
        # Link to the article's content page.
        print(i.find_all('a')[0].get('href'))
        # Publish time, converted to a Unix timestamp.
        st = i.find_all('a')[1].get('title')
        print(st)
        dt = datetime.datetime.strptime(st, '%Y-%m-%d %H:%M')
        print(int(time.mktime(dt.timetuple())))
        print('*' * 30)
        # Source name; flag Hupu originals as 0 and reposts as 1.
        name = i.find('span', class_='other-left').get_text().split(' ')[-1].rstrip()
        print(name)
        keyword = '虎扑'
        item = 0 if keyword in name else 1
        print(item)
        print('*' * 20)
        # Other links attached to the entry (the forum thread among them).
        print(i.find_all('a')[2].get('href'))
        print(i.find_all('a')[3].get('href'))


def get_new_content(url):
    """Parse an article page: lead image plus body paragraphs (text and images)."""
    html = get_html_res(url)
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find('div', class_='artical-content-read'))
    # Lead image of the article, if present.
    print(soup.find('div', class_='artical-importantPic').img)
    content = soup.find('div', class_='artical-main-content')
    for content_html in content.find_all('p'):
        print(content_html)


def get_bbs_info(url):
    """Parse a forum thread: comment count, live view count, and tags."""
    html = get_html_res(url)
    soup = BeautifulSoup(html, 'lxml')
    infos = soup.find('span', class_='browse').get_text().replace('\n', '')
    comment_num = re.match(r'^.*?(\d+).*$', infos).group(1)
    # The tid is the numeric part of the thread URL.
    bbs_id = url.split('/')[-1].split('.')[0]
    # The view count changes in real time, so it comes from a dedicated endpoint.
    browse = get_html_res('https://msa.hupu.com/thread_hit?tid=' + bbs_id)
    print(comment_num, browse)
    item = []   # tag links
    item1 = []  # tag names
    tags = soup.find('div', class_='basketballTobbs_tag')
    for tag in tags.find_all('a'):
        item.append(tag.get('href'))
        item1.append(tag.get_text())
    print(item, item1)


def get_cookies(url):
    session = requests.session()
    session.get(url=url)
    print(session.cookies.get_dict())


def get_html_res(url):
    """Fetch a page with browser-like headers and return the decoded HTML."""
    session = requests.session()
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': '_cnzz_CV30020080=buzi_cookie%7Cdddeb463.80b3.1b1e.9890.4620154a3f87%7C-1; _dacevid3=dddeb463.80b3.1b1e.9890.4620154a3f87; _cnzz_CV30020080=buzi_cookie%7Cdddeb463.80b3.1b1e.9890.4620154a3f87%7C-1; Hm_visit=1568082562621; Hc_visit=1568082562631; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216d19010eaf7a9-0e6d57be88ed6f-38677501-1296000-16d19010eb0310%22%2C%22%24device_id%22%3A%2216d19010eaf7a9-0e6d57be88ed6f-38677501-1296000-16d19010eb0310%22%2C%22props%22%3A%7B%7D%7D; __dacevst=f0c555d7.42b2f9a7|1568097712872',
        'Referer': 'https://voice.hupu.com/nba',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    request = session.get(url=url, headers=headers)
    request.encoding = 'utf-8'
    return request.text


def main():
    get_new_info()
    # get_new_content('https://voice.hupu.com/cba/2475206.html')
    # get_cookies('https://voice.hupu.com/nba')
    # get_bbs_info('https://bbs.hupu.com/29489225.html')


if __name__ == '__main__':
    main()
```
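The script only walks the first page of one section; the pagination URL noted in the comments suggests a simple extension. A hedged sketch, assuming list pages follow a /page/N pattern and keep the same news-list markup as page one:

```python
# Assumption: list pages follow https://voice.hupu.com/china/page/N
# and share the 'news-list' markup parsed in get_new_info().
for page in range(1, 4):
    html = get_html_res('https://voice.hupu.com/china/page/%d' % page)
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.find('div', class_='news-list').find_all('li'):
        print(li.find('div', class_='list-hd').get_text().strip())
```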

Beyond that, there isn't much else worth saying…

--------------- End of post ---------------

Author: 刘俊

Last updated: May 15, 2020, 14:05

License: please keep the original link and author attribution when reposting.