# 爬虫连续代码从面向过程慢慢向面向对象过渡,代码也会更好看,逻辑也更清晰
import requests
import re
def download(url, timeout=10):
    """Fetch *url* over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of handing their HTML to the parser.
    response.raise_for_status()
    return response.text
def parse_page(html):
    """Extract bold title text that follows ``<div class="cont">`` tags.

    Args:
        html: Page markup to search.

    Returns:
        A list holding the captured ``<b>...</b>`` text of every
        non-overlapping match, in document order; empty when nothing
        matches.
    """
    # DOTALL lets the lazy ``.*?`` cross line breaks between the div and <b>.
    title_pattern = re.compile('<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
    return [match.group(1) for match in title_pattern.finditer(html)]
def main():
    """Download the first gushiwen.org listing page and save one title.

    The page HTML and the list of parsed titles are printed to stdout.
    The second parsed title is then written to ``ci.txt`` in the current
    working directory, encoded as UTF-8. If fewer than two titles were
    parsed, a notice is printed and the file is left untouched.
    """
    url = "https://www.gushiwen.org/default_1.aspx"
    html = download(url)
    print(html)
    titles = parse_page(html)
    print(titles)
    # NOTE(review): only the second title (index 1) is saved, as before; a
    # commented-out loop here suggested all titles may have been intended
    # -- confirm.
    if len(titles) < 2:
        # Check before opening: mode "w" would truncate ci.txt even if the
        # write that follows then failed with IndexError.
        print("fewer than two titles found; ci.txt not written")
        return
    # Explicit UTF-8 so the write does not depend on the platform default
    # encoding, which may be unable to encode the title text.
    with open("ci.txt", "w", encoding="utf-8") as f:
        f.write(titles[1])
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()