Crawling Jokes with a Python Web Crawler

Target website: http://xiaohua.zol.com.cn/youmo/

Inspecting the page structure reveals two issues when crawling the joke content:

1. To read a whole joke you have to follow the "More" link below each entry, so every list page leads to many detail pages that must be fetched. This is a multi-task job, so a thread pool is used: it effectively limits the number of concurrent threads, avoiding the performance degradation (or even a crash of the Python interpreter) that a large number of concurrent threads can cause, while still finishing faster than fetching the pages one by one. The steps are listed below, followed by a minimal sketch.

  • Create a thread pool: threadpool.ThreadPool()
  • Create the tasks for the pool: threadpool.makeRequests(); makeRequests takes the function to run in the threads, the list of arguments for that function, and an optional callback (the callback can be left out and defaults to None).
  • Put the created tasks into the thread pool: pool.putRequest()
  • Wait until all tasks have been processed: pool.wait()
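Here is a minimal sketch of these four steps with the third-party threadpool package (assumes `pip install threadpool`; the worker function and the URL list are illustrative placeholders, not the crawler's real code):

import threadpool

def worker(url):                                  # function executed in each thread
    print("fetching", url)

urls = ["page1", "page2", "page3"]                # illustrative task arguments
pool = threadpool.ThreadPool(3)                   # 1. create a pool with 3 worker threads
reqs = threadpool.makeRequests(worker, urls)      # 2. one task per argument, no callback
for req in reqs:
    pool.putRequest(req)                          # 3. submit each task to the pool
pool.wait()                                       # 4. block until every task has finished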

2. On the pages behind those links, the joke text inside the div element is laid out inconsistently: some of it is wrapped in <p> tags and some of it sits directly inside the div. A regular expression can clean this up.

Note the two ways of getting the element node's content (the regex cleanup in the second approach can be written in two ways):

1) Getting the node as a string with lxml

res = requests.get(url, headers=headers)
html = res.text

# lxml way of getting the node
element = etree.HTML(html)
divEle = element.xpath("//div[@class='article-text']")[0]   # get the div node
div = etree.tostring(divEle, encoding='utf-8').decode('utf-8')   # convert the div node to a string

2) Regex cleanup, first version: filter out carriage returns, tabs, and <p> tags with chained replace

# Method 1: replace
content = re.findall('<div class="article-text">(.*?)</div>',html,re.S)
content = content[0].replace('\r','').replace('\t','').replace('<p>','').replace('</p>','').strip() 

3) Regex cleanup, second version: the same filtering of carriage returns, tabs, and <p> tags with re.sub

# Method 2: sub
for index in range(len(content)):
	content[index] = re.sub(r'(\r|\t|<p>|<\/p>)+','',content[index]).strip()

text = ''.join(content)   # avoid shadowing the built-in list
print(text)
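As a quick sanity check, the following self-contained snippet (the HTML fragment is made up purely for illustration) shows that the chained replace and the re.sub version produce the same cleaned text:

import re

# made-up fragment for illustration only; real pages contain more markup
sample = '<div class="article-text">\r\t<p>First line.</p>\r\t<p>Second line.</p></div>'
content = re.findall('<div class="article-text">(.*?)</div>', sample, re.S)

by_replace = content[0].replace('\r', '').replace('\t', '').replace('<p>', '').replace('</p>', '').strip()
by_sub = re.sub(r'(\r|\t|<p>|<\/p>)+', '', content[0]).strip()

print(by_replace == by_sub)   # True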

3. Complete code

app.py

import requests
import threadpool
import time
import os, sys
import re
from lxml import etree
from lxml.html import tostring


class ScrapDemo():
    next_page_url = ""          # URL of the next list page
    page_num = 1                # current page number
    detail_url_list = 0         # number of detail-page URLs on the current page
    deepth = 0                  # crawl depth
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
    }
    fileNum = 0

    def __init__(self, url):
        self.scrapyIndex(url)

    def threadIndex(self, urllist):         # open the thread pool
        if len(urllist) == 0:
            print("Please enter an address to crawl")
            return False
        ScrapDemo.detail_url_list = len(urllist)
        pool = threadpool.ThreadPool(len(urllist))
        reqs = threadpool.makeRequests(self.detailScray, urllist)
        for req in reqs:                    # put every task into the thread pool
            pool.putRequest(req)
            time.sleep(0.5)
        pool.wait()

    def detailScray(self, url):             # fetch a detail page
        if not url == "":
            url = 'http://xiaohua.zol.com.cn/{}'.format(url)
            res = requests.get(url, headers=ScrapDemo.headers)
            html = res.text
            # divEle = etree.HTML(html).xpath("//div[@class='article-text']")[0]   # Element div
            self.downloadText(html)

    def downloadText(self, ele):            # save the jokes to a txt file
        clist = re.findall('<div class="article-text">(.*?)</div>', ele, re.S)
        for index in range(len(clist)):
            '''
                regular expression: filter out carriage returns, tabs and <p> tags
            '''
            clist[index] = re.sub(r'(\r|\t|<p>|<\/p>)+', '', clist[index])
        content = "".join(clist)
        # print(content)
        basedir = os.path.dirname(__file__)
        filePath = os.path.join(basedir)
        filename = "xiaohua{0}-{1}.txt".format(ScrapDemo.deepth, str(ScrapDemo.fileNum))
        file = os.path.join(filePath, 'file_txt', filename)
        try:
            f = open(file, "w")
            f.write(content)
            f.close()
            if ScrapDemo.fileNum == (ScrapDemo.detail_url_list - 1):
                print(ScrapDemo.next_page_url)
                print(ScrapDemo.deepth)
                if not ScrapDemo.next_page_url == "":
                    self.scrapyIndex(ScrapDemo.next_page_url)
        except Exception as e:
            print("Error:%s" % str(e))

        ScrapDemo.fileNum = ScrapDemo.fileNum + 1
        print(ScrapDemo.fileNum)

    def scrapyIndex(self, url):
        if not url == "":
            ScrapDemo.fileNum = 0
            ScrapDemo.deepth = ScrapDemo.deepth + 1
            print("Starting to scrape page {0}".format(ScrapDemo.page_num))
            res = requests.get(url, headers=ScrapDemo.headers)
            html = res.text
            element = etree.HTML(html)
            a_urllist = element.xpath("//a[@class='all-read']/@href")     # "read the full text" links on this page
            next_page = element.xpath("//a[@class='page-next']/@href")    # link to the next page
            if not len(next_page) == 0:
                ScrapDemo.next_page_url = 'http://xiaohua.zol.com.cn/{}'.format(next_page[0])
            if not len(next_page) == 0 and ScrapDemo.next_page_url != url:
                ScrapDemo.page_num = ScrapDemo.page_num + 1
                self.threadIndex(a_urllist[:])
            else:
                print('Download finished, the current page is page {}'.format(ScrapDemo.page_num))
                sys.exit()

 runscrapy.py

from app import ScrapDemo

url="http://xiaohua.zol.com.cn/youmo/"
ScrapDemo(url)
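
One practical note before running: downloadText() writes each joke into a file_txt subdirectory next to the script but never creates that directory, so create it first. A small sketch, assuming the default path used in the code above:

import os

# create the output folder expected by downloadText() if it does not exist yet
os.makedirs(os.path.join(os.path.dirname(__file__), 'file_txt'), exist_ok=True)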

Run as follows:

 

In total, 1,988 files were downloaded.

 


Source: www.cnblogs.com/hqczsh/p/11531368.html