上代码:
#!/usr/bin/python3
"""Multithreaded scraper for English-name details from ename.dict.cn.

Worker threads pull names from a shared queue, fetch each name's page
through a random proxy, parse the header and celebrity sections, and
append one '|'-delimited row per name to haici_infor.csv.
"""
import csv
import queue
import random
import threading
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd

# Set to 1 by the main thread to tell every worker to exit.
exitFlag = 0


def getNames(csvfile):
    """Return the 'EnName' column of a '|'-delimited CSV as a pandas Series."""
    # NOTE(review): source file encoding may need attention (original TODO).
    data = pd.read_csv(csvfile, delimiter='|')
    return data['EnName']


def get_ip_list():
    """Read proxy addresses, one per line, from ip.txt."""
    # 'with' guarantees the handle is closed even if readlines() raises.
    with open('ip.txt', 'r') as f:
        return f.readlines()


def get_random_ip(ip_list):
    """Pick a random proxy from ip_list and return a requests proxies dict."""
    proxy_ip = random.choice(ip_list).strip('\n')
    # Map BOTH schemes: the target URL is plain http, so the original
    # 'https'-only mapping meant the proxy was never actually used.
    return {'http': proxy_ip, 'https': proxy_ip}


def write_file(filePath, row):
    """Append one row to filePath as '|'-delimited CSV."""
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)


def _fetch(url, ip_list, timeout):
    """Single GET attempt through a fresh random proxy and User-Agent."""
    proxies = get_random_ip(ip_list)
    headers = {'User-Agent': str(UserAgent().random)}
    return requests.get(url=url, proxies=proxies, headers=headers,
                        timeout=timeout)


def get_content(url, ip_list):
    """Fetch url with up to three attempts; return (status_code, mbox divs).

    Each retry waits longer and rotates to a new random proxy/User-Agent.
    The final attempt has no timeout, matching the original behaviour.
    """
    try:
        try:
            time.sleep(1)
            req = _fetch(url, ip_list, timeout=20)
        except requests.RequestException:  # was a bare except: too broad
            print("重新运行")
            time.sleep(10)
            req = _fetch(url, ip_list, timeout=40)
    except requests.RequestException:
        print("第二次重新运行")
        time.sleep(15)
        req = _fetch(url, ip_list, timeout=None)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


def get_infor_header(content):
    """Parse the header box into [EnName, CnName, Gender, Source, Meaning]."""
    spans = content.find_all('span')
    EnName = [spans[0].get_text()]
    if len(spans) != 1:
        CnName = [spans[1].get_text()]
        Meaning = [spans[2].get_text()]
        Source = [spans[3].get_text()]
        Gender = [spans[4].em.get('title')]
    else:
        # Sparse page: keep the row shape with empty fields.
        CnName = ['']
        Meaning = ['']
        Source = ['']
        Gender = ['']
    # Row layout: EnName|CnName|Gender|Source|Meaning
    return EnName + CnName + Gender + Source + Meaning


def get_infor_celebrity(content):
    """Join the text of every <li> (famous bearers) with '@' into one field."""
    items = [each.get_text() for each in content.find_all('li')]
    return ['@'.join(items)]


class myThread(threading.Thread):
    """Worker thread that drains the shared name queue via process_data."""

    def __init__(self, threadID, name, q, ip_list):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list

    def run(self):
        print("开启线程:" + self.name)
        # Bug fix: pass this thread's ip_list, not the module-level global.
        process_data(self.name, self.q, self.ip_list)
        print("退出线程:" + self.name)


def process_data(threadName, q, ip_list):
    """Consume names from q until exitFlag is set; scrape and persist each."""
    while not exitFlag:
        queueLock.acquire()
        if not q.empty():  # bug fix: check q, not the global workQueue
            data = q.get()
            queueLock.release()
            print("%s processing %s" % (threadName, data))
            url = 'http://ename.dict.cn/{}'.format(data)
            status_code, content = get_content(url, ip_list)
            if status_code == 200:
                # Header details (name, meaning, source, gender) ...
                list_header = get_infor_header(content[0])
                # ... plus the famous-bearers list.
                list_celebrity = get_infor_celebrity(content[1])
                row = list_header + list_celebrity
                # Serialize file writes across threads.
                queueLock.acquire()
                write_file('haici_infor.csv', row)
                queueLock.release()
        else:
            queueLock.release()
            time.sleep(1)


# Shared by main() and process_data(); must live at module level.
queueLock = threading.Lock()
workQueue = queue.Queue(100000)


def main():
    """Start 10 workers, enqueue every name, wait for the queue to drain."""
    global exitFlag

    threadList = ["Thread-{}".format(i) for i in range(1, 11)]
    nameList = getNames('A-Z.csv')
    ip_list = get_ip_list()

    # Create and start the worker threads.
    threads = []
    for threadID, tName in enumerate(threadList, start=1):
        thread = myThread(threadID, tName, workQueue, ip_list)
        thread.start()
        threads.append(thread)

    # Fill the queue under the lock so workers see it atomically populated.
    queueLock.acquire()
    for word in nameList:
        workQueue.put(word)
    queueLock.release()

    # Wait for the queue to empty; sleeping avoids the original busy-spin
    # that pinned a CPU core at 100%.
    while not workQueue.empty():
        time.sleep(0.5)

    # Signal workers to exit, then wait for them.
    exitFlag = 1
    for t in threads:
        t.join()
    print("退出主线程")


if __name__ == "__main__":
    main()