Baidu Crawler

Copyright notice: this is the author's original article; do not repost without permission. https://blog.csdn.net/u014119694/article/details/82425232


Collecting the names

# coding: utf-8
import requests
from lxml import etree

proxies = {
    # left empty: direct connection; fill in proxy settings here if one is needed
}


#r=requests.get('http://www.baidu.com',proxies=proxies)

'''
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}


session = requests.session()
response = session.get("https://www.zhihu.com", headers=headers, proxies=proxies,verify=False)
#proxy.huawei.com
'''

def getName(link):
    """Fetch one listing page and append the names found on it to Name.txt."""
    print(link)
    NameList = []
    try:
        r = requests.get(link, proxies=proxies)
        r.encoding = 'gbk'  # the site serves GBK-encoded pages
        html = etree.HTML(r.text)
        NameList = html.xpath('//div[@class="i_cont_s"]/a/text()')
        with open('Name.txt', 'a+') as f:
            # trailing '\n' so consecutive pages do not run together
            f.write('\n'.join(NameList) + '\n')
    except Exception:
        print('-----------: ', link)
    return len(NameList)

# Earlier approach (kept for reference): walk the A-Z index pages with pagination
'''
baselink='http://www.manmankan.com/dy2013/mingxing/'
for i in range(ord('A'),ord('Z')+1):
    link=baselink+chr(i)+'/'
    getName(link)
    page=2
    while(1):
        slink=link+'index_'+str(page)+'.shtml'
        lens=getName(slink)

        page+=1
        if(lens<1):
            break


'''
Links=[
'http://www.manmankan.com/dy2013/mingxing/yanyuan/neidi/',
'http://www.manmankan.com/dy2013/mingxing/yanyuan/xianggang/',
'http://www.manmankan.com/dy2013/mingxing/yanyuan/taiwan/',
'http://www.manmankan.com/dy2013/mingxing/yanyuan/riben/',
'http://www.manmankan.com/dy2013/mingxing/yanyuan/oumei/',
'http://www.manmankan.com/dy2013/mingxing/yanyuan/hanguo/',
'http://www.manmankan.com/dy2013/mingxing/geshou/neidi/',
'http://www.manmankan.com/dy2013/mingxing/geshou/xianggang/',
'http://www.manmankan.com/dy2013/mingxing/geshou/taiwan/',
'http://www.manmankan.com/dy2013/mingxing/geshou/riben/',
'http://www.manmankan.com/dy2013/mingxing/geshou/oumei/',
'http://www.manmankan.com/dy2013/mingxing/geshou/hanguo/'
]
for link in Links:
    getName(link)
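
Since getName only ever appends to Name.txt, re-running the script (or a name that shows up in more than one category list) leaves duplicate lines behind. A minimal deduplication pass, assuming Name.txt holds one name per line as written above:

# deduplicate Name.txt, keeping the first occurrence of each name in order
with open('Name.txt', 'r') as f:
    names = f.read().splitlines()

seen = set()
unique = [n for n in names if n and not (n in seen or seen.add(n))]

with open('Name.txt', 'w') as f:
    f.write('\n'.join(unique) + '\n')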

Scraping the photos

# coding: utf-8
'''
with open('Name.txt', 'r+') as f:
    NameList=f.read().splitlines()
print(len(NameList))
'''
import requests
import os
import multiprocessing
proxies = {
    "http": "http://d84105117:@[email protected]:8080/",   # note: the trailing '/' is required
    "https": "http://d84105117:@[email protected]:8080/"
}
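# Caveat: the password above contains a literal '@', which is ambiguous in a
# URL because '@' also separates the credentials from the host. If a proxy
# password contains reserved characters, percent-encoding them is safer, e.g.
# (assuming the password really is "@123456"):
#
#     from urllib.parse import quote
#     quote("@123456", safe="")   # -> "%40123456"
#
# which would give "http://d84105117:%[email protected]:8080/".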

def getManyPages(keyword,pages):
    params=[]
    for i in range(0,30*pages,30):
        params.append({
                      'tn': 'resultjson_com',
                      'ipn': 'rj',
                      'ct': 201326592,
                      'is': '',
                      'fp': 'result',
                      'queryWord': keyword,
                      'cl': 2,
                      'lm': -1,
                      'ie': 'utf-8',
                      'oe': 'utf-8',
                      'adpicid': '',
                      'st': -1,
                      'z': '',
                      'ic': 0,
                      'word': keyword,
                      's': '',
                      'se': '',
                      'tab': '',
                      'width': '',
                      'height': '',
                      'face': 0,
                      'istype': 2,
                      'qc': '',
                      'nc': 1,
                      'fr': '',
                      'pn': i,
                      'rn': 30,
                      'gsm': '1e',
                      '1536131285172': ''
                  })
    url = 'https://image.baidu.com/search/acjson'

    urls = []
    for param in params:
        try:
            # one request per page; re-using the same response avoids
            # fetching every page twice
            r = requests.get(url, params=param, proxies=proxies)
            print(r.url)
            urls.append(r.json().get('data'))
        except Exception:
            pass

    return urls
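# A note on the paging: the acjson endpoint appears to page via pn (result
# offset) and rn (page size), so the loop above requests pn = 0, 30, 60, ...
# Each collected 'data' list holds dicts whose 'thumbURL' field getImg
# downloads below.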


def getImg(dataList, localPath):

    if not os.path.exists(localPath):  # create the folder if needed
        os.mkdir(localPath)

    x = 0
    for data in dataList:
        if not data:  # a response may have carried no 'data' field
            continue
        for i in data:
            if i.get('thumbURL') is not None:
                print('Downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'), proxies=proxies, timeout=15, verify=False)
                with open(localPath + '%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
            #else:
            #    print('no thumbnail link for this entry')
def spider(keyword):
    print('Processing', keyword)
    dataList = getManyPages(keyword, 3)  # arg 1: keyword, arg 2: number of pages to fetch
    getImg(dataList, keyword + '/')      # arg 2: directory to save into
    with open('ok.txt', 'a+') as f:
        f.write(keyword + '\n')

if __name__ == '__main__':
    with open('Name.txt', 'r') as f:
        NameList = f.read().splitlines()
    # 'a+' creates ok.txt on the first run instead of raising FileNotFoundError;
    # seek back to the start so already-finished keywords can be read
    with open('ok.txt', 'a+') as f:
        f.seek(0)
        OkList = f.read().splitlines()

    pool = multiprocessing.Pool(processes=4)

    for keyword in NameList:
        if keyword in OkList:
            print(keyword + ' is already ok, continue-----')
            continue

        pool.apply_async(spider, args=(keyword,))

    pool.close()
    pool.join()
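
One more practical note: because getImg passes verify=False, urllib3 emits an InsecureRequestWarning for every single download. If that output gets too noisy, the warning can be silenced; a minimal sketch, assuming the urllib3 package that requests depends on is importable directly:

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)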


Cropping the images

# coding: utf-8
import mxnet as mx
from mtcnn_detector import MtcnnDetector
import cv2
import os
import time

import numpy as np


detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0), num_worker=4, accurate_landmark=False)

# note: a raw string cannot end in a single backslash, hence the doubled one
base_dirs = r'D:\code\python\china\\'
dirlist = os.listdir(base_dirs)
for dirs in dirlist:
    savedir = r'D:\data\\' + dirs
    if not os.path.exists(savedir):  # create the folder if needed
        os.mkdir(savedir)
    #else:
    #    continue
    base_dir=base_dirs+dirs+'\\'
    index=0
    imagelist=os.listdir(base_dir)
    while index < len(imagelist):
        imagecp = base_dir + imagelist[index]
        print(imagecp)

        # np.fromfile + cv2.imdecode instead of cv2.imread so that Chinese
        # (non-ASCII) Windows paths are read correctly
        img = cv2.imdecode(np.fromfile(imagecp, dtype=np.uint8), 1)
        results = detector.detect_face(img)

        # keep only images in which exactly one face was detected
        if results is not None and len(results[0]) == 1:
            total_boxes = results[0]
            points = results[1]

            #draw = img.copy()
            b = total_boxes[0]
            print(b)
            try:
                # grow the box by half its height/width on every side
                # (i.e. double each dimension), then clamp to the image borders
                bound0 = (b[3] - b[1]) / 2
                bound1 = (b[2] - b[0]) / 2
                b[1] -= bound0
                b[0] -= bound1
                if b[1] < 0: b[1] = 0
                if b[0] < 0: b[0] = 0
                b[2] += bound1
                b[3] += bound0
                if b[2] > img.shape[1]: b[2] = img.shape[1]
                if b[3] > img.shape[0]: b[3] = img.shape[0]

                # discard crops that came out too small
                imageok = img[int(b[1]):int(b[3]), int(b[0]):int(b[2])]
                if imageok.shape[0] > 100 and imageok.shape[1] > 100:
                    diss = savedir + '\\' + imagelist[index]

                    #cv2.imwrite(diss,imageok)
                    # imencode + tofile, again so non-ASCII paths work
                    cv2.imencode('.jpg', imageok)[1].tofile(diss)
                    cv2.rectangle(img, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
                    #cv2.imshow("img",img)
                    #key = cv2.waitKey(0)

            except Exception:
                pass
        index+=1
        '''
                cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))

            for p in points:
                for i in range(5):
                    cv2.circle(draw, (p[i], p[i + 5]), 1, (0, 0, 255), 2)
            cv2.imshow("img",draw)
            key = cv2.waitKey(0)
            '''    
        '''
        if(results is not None and len(results[0])==1):
            try:
                b=results[0][0]
                imageok=img[int(b[1])-50:int(b[0])+50,int(b[3])-50:int(b[2])+50]
                if imageok.shape[0]<100 or imageok.shape[1]<100:
                    continue
                diss=savedir+'\\'+imagelist[index]
                cv2.imwrite(diss,imageok)
               # cv2.rectangle(img, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))

                #cv2.imshow("img",imageok)
                #key = cv2.waitKey(0)

            except:
                pass
        '''



MTCNN: https://github.com/pangyupo/mxnet_mtcnn_face_detection
Reference: https://blog.csdn.net/qq_32166627/article/details/60882964
