from urllib import request
import re
# Request headers: every page request is sent with a desktop Chrome User-Agent.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    )
}

# Inclusive range of page ids to download, one page per chapter.
FIRST_PAGE_ID = 2693152
# NOTE(review): the original script also carried the bare number 2693854 in a
# comment next to this bound -- presumably the id of the book's last page;
# confirm before widening the range.
LAST_PAGE_ID = 2693172

# File that every downloaded chapter is appended to.
OUTPUT_PATH = 'E:/其他/飞升之后.txt'

# Seconds to wait on the network before giving up on a page, so that a stalled
# connection cannot hang the whole run.
TIMEOUT_SECONDS = 30

# The patterns are loop-invariant, so they are compiled once here.
TITLE_RE = re.compile(r'<h1>(.*?)</h1>', re.I)  # chapter title
BODY_FIRST_RE = re.compile(r'<div id="content">(.*?)<br />', re.I)  # start of body
# NOTE(review): the single leading space in the next two patterns matches ANY
# space, including the ones inside tags such as `<div id=...>`. It may stand in
# for indentation or an HTML entity in the real page source -- confirm against
# the live markup before relying on the extracted text.
BODY_MIDDLE_RE = re.compile(r' (.*?)<br />')  # text in the middle of the body
BODY_LAST_RE = re.compile(r' (.*?)</div>')  # end of body


def fetch_page(page_id):
    """Download one chapter page and return its HTML decoded as UTF-8.

    Args:
        page_id: Numeric id that is substituted into the chapter URL.

    Returns:
        The page body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (it is an ``OSError``).
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    url = f'http://www.yingsx.com/6_6366/{page_id}.html'
    req = request.Request(url, headers=header)
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req, timeout=TIMEOUT_SECONDS) as response:
        return response.read().decode('utf-8')


def extract_chapter_text(html):
    """Build the text that is written to the output file for one page.

    The result is the first title match followed by a newline, then every
    body-start match, then every body-middle match, then the first body-end
    match followed by a newline. No separator is inserted between the body
    pieces.

    A page without a title or without a body-end match simply contributes
    nothing for that part instead of raising ``IndexError``.

    Args:
        html: Decoded HTML of a chapter page.

    Returns:
        The concatenated chapter text; ``''`` when nothing matches.
    """
    titles = TITLE_RE.findall(html)
    endings = BODY_LAST_RE.findall(html)

    pieces = []
    if titles:
        pieces.append(titles[0] + '\n')
    pieces.extend(BODY_FIRST_RE.findall(html))
    pieces.extend(BODY_MIDDLE_RE.findall(html))
    if endings:
        pieces.append(endings[0] + '\n')
    return ''.join(pieces)


def main():
    """Download every page in the configured range and append it to the file."""
    page_ids = range(FIRST_PAGE_ID, LAST_PAGE_ID + 1)
    for chapter_no, page_id in enumerate(page_ids, start=1):
        print(f'正在下载第{chapter_no}章……')
        text = extract_chapter_text(fetch_page(page_id))
        # Append per chapter so finished chapters are on disk if a later
        # download fails. The explicit encoding avoids UnicodeEncodeError
        # under a non-UTF-8 platform default.
        with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
            f.write(text)
    print('已完成下载,请查收!')


if __name__ == '__main__':
    main()