Python: crawling CSDN article content with a simple GUI tool

Today I wrote a small script that crawls CSDN articles.

It may not be ideal for beginners; you can optimize the following code yourself when you have time.

Create a new python file, copy my code into it and save it

Install the required Python dependencies (e.g. `pip install beautifulsoup4`).

import os  # fixed: original line read "import us" — a typo for os, which is used for path handling below
import re
import tkinter as tk
import urllib.error
import urllib.request

from bs4 import BeautifulSoup


def main():
    """Read the URL from the entry box, crawl it, then close the window."""
    target_url = entry_url.get()
    # Sample article URL kept for reference:
    # "https://blog.csdn.net/qq_57420582/article/details/133796601?spm=1001.2014.3001.5502"
    getDate(target_url)
    win.destroy()

def getDate(baseurl):
    """Fetch a CSDN article, reduce it to plain text (keeping image URLs),
    and save the result to a text file on the user's desktop.

    Args:
        baseurl: URL of the CSDN article to crawl.

    Returns:
        list[str]: collected text fragments — article title first, then body.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    datelist = []
    pattern = r'src="(.*?)"'
    # Extract the article title with get_text() instead of the original
    # brittle string surgery (str(tag)[48:] plus a "<p>" replace that never
    # matched an <h1>), which silently broke whenever CSDN changed the exact
    # attribute layout of the heading tag.
    title_tag = soup.find('h1', class_="title-article")
    title = title_tag.get_text(strip=True) if title_tag is not None else "untitled"
    datelist.append(title + "\n")
    htmledit = soup.find('div', class_="htmledit_views")
    if htmledit is not None:  # guard: page layout may change / fetch may fail
        for item in htmledit:
            # Crude tag stripping: drop paragraph/heading/quote wrappers and
            # turn block ends into newlines.
            item = str(item).replace("<p>","").replace("</p>","\n").replace("<br/>","\n").replace("<h2>","").replace("</h2>","\n").replace("<blockquote>","").replace("<!-- -->","").replace("</blockquote>","\n")
            # If the fragment carries an image/media src, keep just its URL.
            match = re.search(pattern, item)
            if match:
                datelist.append(match.group(1) + "\n")
            else:
                datelist.append(item)
    desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")  # user's desktop
    savepath = os.path.join(desktop_path, "csdn crawl file")  # output folder name
    # NOTE(review): the title is used verbatim as a file name; titles containing
    # characters invalid on the filesystem would still fail — consider sanitizing.
    saveDate(''.join(datelist), savepath, title)
    return datelist

def askURL(url):
    """Download *url* and return the response body decoded as UTF-8.

    Returns an empty string when the request fails; the HTTP status code
    and/or failure reason are printed for diagnostics. A browser-like
    User-Agent is sent because CSDN rejects the default urllib agent.
    """
    head = {
   "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the underlying socket is closed even if
        # read()/decode() raises — the original leaked the response object.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def saveDate(datalist, savepath, savename):
    """Write *datalist* (one big string) to ``<savepath>/<savename>.txt``.

    Creates the target directory first if it does not exist.
    """
    # makedirs(exist_ok=True) replaces the original isdir-then-mkdir pair:
    # it avoids the check-then-create race and also creates missing parent
    # directories, which os.mkdir would fail on.
    os.makedirs(savepath, exist_ok=True)
    # os.path.join instead of the hard-coded "\\" separator so the path is
    # built correctly on non-Windows systems too.
    with open(os.path.join(savepath, savename + ".txt"), 'w', encoding='utf-8') as file:
        file.write(datalist)

def frame_center(window, width, height):
    """Resize *window* to width x height and center it on the screen."""
    screen_w = window.winfo_screenwidth()
    screen_h = window.winfo_screenheight()
    offset_x, offset_y = (screen_w - width) // 2, (screen_h - height) // 2
    window.geometry(f"{width}x{height}+{offset_x}+{offset_y}")


if __name__ == '__main__':
    # Build the single-window GUI: a label, a URL entry box, and a button
    # that triggers the crawl (see main()).
    win = tk.Tk()
    win.title('csdn crawler')  # window title
    frame_center(win, 400, 200)  # fixed 400x200 window, centered on screen
    win.resizable(False, False)

    label = tk.Label(win, text='Please enter the URL:')
    label.pack(pady=10)  # vertical spacing around the label

    label1 = tk.Label(win, text='(It takes some time to crawl!)')
    label1.pack(pady=0)

    entry_url = tk.Entry(win, width=30)  # URL input box read by main()
    entry_url.pack(pady=5)

    btn_record = tk.Button(win, text='Start climbing!', command=main)
    btn_record.pack(pady=40)
    win.mainloop()
    # Moved below mainloop(): the original printed this at startup, before
    # any crawling had happened. It now prints once the window is closed.
    print("Crawling completed")

Then download Axialis IconWorkshop  

This is to select a picture from local

Sure

keep

Then we have an icon file

Create a new folder and put the images and python files in it

Type `cmd` in the folder's address bar and press Enter to open a command-line window in that folder.

pip install Pyinstaller

Pyinstaller -F -w -i klk.ico klk.py

Then we got it, the exe file is in the dist folder

We run the exe program

pop up this window

Select an article in csdn, copy the url and paste it here

Click to start crawling

You can see on the computer desktop that a csdn crawl file is generated.

This is the content you crawled to.

Guess you like

Origin blog.csdn.net/qq_57420582/article/details/134140738