Python爬取糗事百科源代码1.0

糗事百科爬虫源代码1.0

'''
#=============================================================================
#     FileName: qiushibaike.py
#         Desc: 
#       Author: modys
#        Email: http://www.modys.top
#     HomePage: http://www.modys.top
#      Version: 1.0
#   LastChange: 2020-08-23 19:11:02
#      History:
#=============================================================================
'''
# 爬取糗事百科段子
from urllib import request
import re

# Scrape the text jokes from the first five listing pages of qiushibaike.com
# and write them to a text file, one joke per entry followed by a separator.

# Browser-like User-Agent header attached to every request made by the opener.
headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0')
opener = request.build_opener()
opener.addheaders = [headers]
# Install the opener globally so request.urlopen() goes through it.
request.install_opener(opener)

# Compiled once, outside the page loop. re.S lets '.' match newlines so one
# match can span several lines of HTML; group 1 is the text inside the first
# <span> that follows a <div class="content"> tag.
pat = '<div class="content">.*?<span>(.*?)</span>.*?</a>'
content_re = re.compile(pat, re.S)

# The with-statement closes the output file even if a request or write fails.
with open('C:\\Users\\Administrator\\Desktop\\qiushibaike.txt', 'w', encoding='utf-8') as fh:
    # Crawl pages 1 through 5.
    for page in range(1, 6):
        url = 'https://www.qiushibaike.com/text/page/' + str(page)
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(url) as resp:
            # 'ignore' drops undecodable bytes instead of raising.
            req = resp.read().decode('utf-8', 'ignore')
        print(len(req))
        print(url)
        rst = content_re.findall(req)
        # A distinct loop name avoids shadowing the page counter.
        for joke in rst:
            fh.write(joke.strip() + "\n-----------\n")

发表评论

电子邮件地址不会被公开。 必填项已用 * 标注