Today I wrote a tutorial about crawling CSDN articles.
The code is not very polished — it's a beginner-level example, and you are welcome to optimize it yourself when you have time.
Create a new Python file, copy my code into it, and save it.
Install the required Python dependencies (e.g. `pip install beautifulsoup4`).
import os
import re
import tkinter as tk
import urllib.error
import urllib.request


def main():
    """Button callback: crawl the URL typed into the entry box, then close the window."""
    baseurl = entry_url.get()
    # baseurl = "https://blog.csdn.net/qq_57420582/article/details/133796601?spm=1001.2014.3001.5502"
    getDate(baseurl)
    print("Crawling completed")  # printed only after the crawl actually finishes
    win.destroy()


def getDate(baseurl):
    """Fetch the CSDN article at *baseurl*, extract its title and body text
    (image fragments are reduced to their src URL), and save everything to a
    "csdn crawl file" folder on the user's desktop.

    Returns the list of extracted text fragments.
    """
    # Local import so the module loads even when bs4 is not installed;
    # it is only needed once a crawl is requested.
    from bs4 import BeautifulSoup

    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    datelist = []
    img_pattern = re.compile(r'src="(.*?)"')  # compiled once, used per fragment

    # Title: use get_text() rather than slicing str(tag) at a magic offset,
    # which breaks as soon as the tag's attributes change length.
    title_tag = soup.find('h1', class_="title-article")
    title = title_tag.get_text(strip=True) if title_tag else "untitled"
    # Strip characters that are not valid in Windows file names.
    title = re.sub(r'[\\/:*?"<>|]', '', title)
    datelist.append(title + "\n")

    htmledit = soup.find('div', class_="htmledit_views")
    for item in htmledit:
        item = (str(item)
                .replace("<p>", "").replace("</p>", "\n")
                .replace("<br/>", "\n")
                .replace("<h2>", "").replace("</h2>", "\n")
                .replace("<blockquote>", "").replace("</blockquote>", "\n")
                .replace("<!-- -->", ""))
        match = img_pattern.search(item)
        if match:
            # Image fragment: keep only the URL it points at.
            datelist.append(match.group(1) + "\n")
        else:
            datelist.append(item)

    desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")  # user's desktop
    savepath = os.path.join(desktop_path, "csdn crawl file")  # output folder name
    saveDate(''.join(datelist), savepath, title)
    return datelist


def askURL(url):
    """Download *url* and return the response body decoded as UTF-8.

    Returns an empty string on failure; the HTTP code / reason is printed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the connection is closed.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveDate(datalist, savepath, savename):
    """Write *datalist* (one string) to <savepath>/<savename>.txt,
    creating the output folder first when necessary.
    """
    # makedirs with exist_ok handles both a pre-existing folder and missing parents.
    os.makedirs(savepath, exist_ok=True)
    with open(os.path.join(savepath, savename + ".txt"), 'w', encoding='utf-8') as file:
        file.write(datalist)


def frame_center(window, width, height):
    """Resize *window* to width x height and center it on the screen."""
    screen_width = window.winfo_screenwidth()
    screen_height = window.winfo_screenheight()
    x = (screen_width - width) // 2
    y = (screen_height - height) // 2
    window.geometry(f"{width}x{height}+{x}+{y}")


if __name__ == '__main__':
    win = tk.Tk()
    win.title('csdn crawler')  # Window name
    frame_center(win, 400, 200)  # window size
    win.resizable(False, False)
    label = tk.Label(win, text='Please enter the URL:')  # Text in the window
    label.pack(pady=10)  # Window content spacing
    label1 = tk.Label(win, text='(It takes some time to crawl!)')
    label1.pack(pady=0)
    entry_url = tk.Entry(win, width=30)  # Input box in window
    entry_url.pack(pady=5)
    btn_record = tk.Button(win, text='Start climbing!', command=main)  # Button in window
    btn_record.pack(pady=40)
    win.mainloop()
Then download Axialis IconWorkshop
This is to select a picture from local
Sure
keep
Then we have an icon file
Create a new folder and put the images and python files in it
Type cmd in the folder's address bar and press Enter to open a command-line window in that folder.
pip install Pyinstaller
Pyinstaller -F -w -i klk.ico klk.py
Then we have it: the .exe file is in the dist folder.
We run the exe program
pop up this window
Select an article in csdn, copy the url and paste it here
Click to start crawling
You can see that a folder named "csdn crawl file" has been generated on the computer desktop.
This is the content you crawled.