爬虫spider流程示意图
下面根据以上流程实现一个简单的爬虫。这只是一种简化的实现思路,实际项目要复杂得多,此处不做具体讨论。
1. 目录
2. engine.py
# encoding=utf-8
"""Entry point of the spider: read URLs, then download, parse and store them."""
import os

from spider.scheduler import Scheduler


def read_urls(file_path):
    """Return the non-empty, whitespace-stripped lines of a URL list file.

    :param file_path: path of a UTF-8 text file holding one URL per line.
    :return: list of URL strings in file order; blank lines are skipped.
    """
    # Open read-only: the previous 'r+' mode required write permission even
    # though the file is never written to.
    with open(file_path, 'r', encoding='utf-8') as fp:
        stripped = (line.strip() for line in fp)
        return [line for line in stripped if line]


def engine():
    """Run the pipeline once: read urls.txt, download, analyse, store."""
    # Resolve the directory from an absolute path. When __file__ is a bare
    # relative name, dirname() is '' and plain concatenation with '/urls.txt'
    # would point at the filesystem root.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, 'urls.txt')

    urls = read_urls(path)
    htmls = Scheduler.download(urls)
    data = Scheduler.analysis(htmls)
    Scheduler.storage(data)


if __name__ == '__main__':
    engine()
3. scheduler.py
# encoding=utf-8
from spider.downloader import Download
from spider.analysis import Analysis
from spider.storage import Storage


class Scheduler:
    """Chain the three spider stages: download, analysis and storage."""

    def __init__(self):
        pass

    @staticmethod
    def download(urls):
        """Fetch every URL and pair it with the downloaded text.

        :param urls: a list of URLs; any non-list value is treated as a
            single URL.
        :return: list of ``(url, html)`` tuples in input order.
        """
        if not isinstance(urls, list):
            urls = [urls]
        # Download stage.
        return [(target, Download.get(target)) for target in urls]

    @staticmethod
    def analysis(_tuple):
        """Parse downloaded pages.

        :param _tuple: iterable shaped like ``[(url, html), (url, html)]``.
        :return: list with one ``Analysis.parse(url, html)`` result per pair.
        """
        # Parsing stage.
        return [Analysis.parse(page_url, page_html) for page_url, page_html in _tuple]

    @staticmethod
    def storage(data):
        """Hand every parsed record to ``Storage.storage``.

        :param data: iterable of parsed records.
        """
        # Storage stage.
        for record in data:
            Storage.storage(record)
4. downloader.py
# encoding=utf-8
import requests


class Download:
    """Thin wrapper around ``requests`` for fetching page text.

    Topics listed by the original author for a real downloader:
    1. efficient crawling
    2. common anti-anti-crawler techniques
    3. data volume: concurrency and distribution
    """

    def __init__(self):
        pass

    @staticmethod
    def get(url, headers=None, timeout=None):
        """Send a GET request and return the response body as text.

        :param url: address to fetch.
        :param headers: optional dict of request headers.
        :param timeout: optional timeout forwarded to ``requests.get``;
            ``None`` keeps the previous behaviour of no timeout.
        :return: ``response.text`` of the reply.
        """
        # ``None`` instead of a shared ``{}`` default avoids the mutable
        # default argument pitfall; requests treats both the same.
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    @staticmethod
    def post(url, data=None, headers=None, timeout=None):
        """Send a POST request and return the response body as text.

        :param url: address to post to.
        :param data: optional form payload.
        :param headers: optional dict of request headers.
        :param timeout: optional timeout forwarded to ``requests.post``;
            ``None`` keeps the previous behaviour of no timeout.
        :return: ``response.text`` of the reply.
        """
        response = requests.post(url, data=data, headers=headers, timeout=timeout)
        return response.text

    @staticmethod
    def get_headers(params):
        """Return ``params`` unchanged.

        :param params: value handed back to the caller as-is.
        :return: the same ``params`` object.
        """
        return params
5. storage.py
# encoding=utf-8
import hashlib
import logging

import pymysql

logger = logging.getLogger(__name__)


class Storage:
    """Persist parsed records in MySQL, keyed by the MD5 of their URL."""

    table = 'spider'

    def __init__(self):
        pass

    @staticmethod
    def storage(params):
        """Insert ``params`` as a new row, or update the row that already exists.

        The row id is the MD5 hex digest of ``params['url']``.

        :param params: dict of column name -> value; must contain ``'url'``.
        :return: None
        """
        # NOTE(review): connection settings are hard-coded here; consider
        # moving them to configuration.
        sql_util = SqlUtil('127.0.0.1', 3306, 'root', '123456', 'mysql')
        try:
            _id = Storage.url2md5(url=params['url'])
            if sql_util.exists(Storage.table, _id):
                # SqlUtil.update takes ``update=``; the former ``dict_value=``
                # keyword does not exist and raised TypeError.
                sql_util.update(Storage.table, update=params, where={'id': _id})
            else:
                # Store the id that exists() looks up, unless the record
                # already carries one.
                record = dict(params)
                record.setdefault('id', _id)
                sql_util.insert(Storage.table, [record])
        finally:
            # A new connection is opened per call, so always release it.
            sql_util.close()

    @staticmethod
    def url2md5(url):
        """Return the MD5 hex digest of ``url``.

        :param url: ``str`` (encoded as UTF-8 first) or bytes-like value.
        :return: 32-character hexadecimal digest string.
        """
        if isinstance(url, str):
            url = url.encode('utf-8')
        m2 = hashlib.md5()
        m2.update(url)
        return m2.hexdigest()


class SqlUtil:
    """Small helper around a pymysql connection for basic CRUD statements."""

    # Maps Mongo-style operator keys used in ``where`` dicts to SQL operators.
    where_dict = {
        '$in': 'IN',
        '$like': 'LIKE',
        '$eq': '=',
        '$reg': 'REGEXP',
        '$ge': '>=',
        '$le': '<=',
        '$gt': '>',
        '$lt': '<'
    }

    CREATE_DATABASE = 'CREATE DATABASE IF NOT EXISTS {database}'
    CREATE_TABLE = 'CREATE TABLE {table} ({keys}, UNIQUE INDEX({index}))'
    CREATE_INDEX = 'CREATE INDEX {index} ON {table}'
    INSERT_TABLE = 'INSERT INTO {table} ({keys}) VALUES ({values})'
    DELETE_TABLE = 'DELETE FROM {table} WHERE {where}'
    UPDATE_TABLE = 'UPDATE {table} SET {update} WHERE {where}'
    # UPDATE_TABLE_ALL = 'UPDATE {table} SET {update}'
    SELECT_TABLE = 'SELECT {keys} FROM {table} WHERE {where}'
    SELECT_TABLE_ALL = 'SELECT * FROM {table} WHERE {where}'
    # INSERT_TABLE_UPDATE = 'INSERT INTO {table} {keys} VALUES {values} ON DUPLICATE KEY UPDATE {update}'

    MAX_RETRY_TIMES = 3

    def __init__(self, ip, port, user, password, database, charset='utf-8'):
        """Store the connection settings and connect immediately.

        :param ip: database host.
        :param port: database port.
        :param user: user name.
        :param password: user password.
        :param database: name of the database to use.
        :param charset: stored on the instance; it is not passed to
            ``pymysql.connect`` by :meth:`connection`.
        """
        self.ip = ip
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset
        self.connect = None
        self.cursor = None
        self.connected = False
        self.connection()

    def check(self):
        """Ping the server and reconnect when the ping fails.

        :return: True when the connection is usable; reconnect errors
            raised by :meth:`connection` propagate.
        """
        try:
            self.connect.ping()
            return True
        except Exception:
            # Any ping failure means the connection is unusable; rebuild it.
            logger.warning('Server not available!')
            self.close()
            return self.connection()

    def connection(self):
        """Open the database connection and the default cursor.

        Troubleshooting note from the original author, for the error
        "client does not support authentication"::

            mysql> alter user 'root'@'localhost' identified with
                   mysql_native_password by '123456';
            mysql> flush privileges;

        :return: True once connected; connection errors are raised.
        """
        self.connect = pymysql.connect(host=self.ip, port=self.port, user=self.user,
                                       passwd=self.password, db=self.database)
        self.cursor = self.connect.cursor()
        self.connect.ping()
        return True

    def close(self):
        """Close the connection; failures are logged and otherwise ignored."""
        try:
            self.connect.close()
        except Exception:
            # Best effort: the connection may be missing or already closed.
            logger.debug('Ignoring error while closing connection', exc_info=True)

    def mogrify(self, sql, data=None):
        """Return ``sql`` with ``data`` escaped and bound (SQL-injection safe).

        :param sql: statement containing ``%s`` placeholders.
        :param data: parameters to bind, or None to leave ``sql`` as-is.
        :return: the statement as it would be sent to the server.
        """
        return self.cursor.mogrify(sql, data)

    def command(self, sql, data=None, find=False, _dict=True, _log=True):
        """Execute a statement and commit.

        :param sql: statement to run.
        :param data: optional sequence of parameter tuples; when non-empty
            the statement is run with ``executemany``.
        :param find: when True return the fetched rows instead of True.
        :param _dict: use a ``DictCursor`` (rows as dicts) when True.
        :param _log: log the statement at debug level when True.
        :return: fetched rows if ``find`` else True; None when the
            connection check fails without raising.
        """
        if _log:
            # The original referenced an undefined ``Logger`` name here.
            logger.debug('%s', sql)
        if not self.check():
            return None

        if _dict:
            cur = self.connect.cursor(pymysql.cursors.DictCursor)
        else:
            cur = self.connect.cursor()
        try:
            if data:
                cur.executemany(sql, data)
            else:
                cur.execute(sql)
            results = cur.fetchall()
        finally:
            # Release the cursor even when execution fails.
            cur.close()
        self.connect.commit()
        return results if find else True

    def insert(self, table, value_list):
        """Insert one or more rows.

        :param table: target table name.
        :param value_list: a dict (single row) or a list of dicts sharing
            the same keys.
        :return: 0 when there is nothing to insert, otherwise the result of
            :meth:`command`.
        """
        if not value_list:
            return 0
        # Wrap a single row so convert_insert_data always sees a list.
        value_list = value_list if isinstance(value_list, list) else [value_list]
        keys, place, values = self.convert_insert_data(value_list)
        sql = SqlUtil.INSERT_TABLE.format(table=table, keys=keys, values=place)
        return self.command(sql, data=values)

    def find(self, table, where=None, keys=None, multi=True):
        """Select rows from ``table``.

        :param table: table name.
        :param where: filter dict understood by :meth:`convert_where_data`;
            empty or None selects every row.
        :param keys: column names to return (list, or a ready-made string);
            empty or None selects ``*``.
        :param multi: when False only the first matching row is fetched.
        :return: the fetched rows.
        """
        w_keys, w_place, w_values = self.convert_where_data(where or {})
        # An empty filter must still yield valid SQL after WHERE.
        condition = w_place or '1=1'
        if keys:
            columns = keys if isinstance(keys, str) else ', '.join(keys)
            sql = SqlUtil.SELECT_TABLE.format(table=table, keys=columns, where=condition)
        else:
            sql = SqlUtil.SELECT_TABLE_ALL.format(table=table, where=condition)
        if not multi:
            sql += ' limit 1'
        # Bind the filter values in both branches; skip when there are none.
        sql = self.mogrify(sql, w_values or None)
        return self.command(sql, find=True)

    def exists(self, table, _id):
        """Return the row whose ``id`` equals ``_id`` (falsy when absent).

        :param table: table name.
        :param _id: id value to look up.
        :return: rows fetched by :meth:`find`, at most one.
        """
        return self.find(table=table, where={'id': _id}, multi=False)

    def update(self, table, update, where):
        """Update the rows matching ``where``.

        :param table: table name.
        :param update: dict of column -> new value.
        :param where: filter dict understood by :meth:`convert_where_data`.
        :return: result of :meth:`command`.
        """
        # SET assignments are comma-separated, unlike WHERE conditions.
        u_keys, u_place, u_values = self.convert_where_data(update, _and=', ')
        w_keys, w_place, w_values = self.convert_where_data(where)
        sql = SqlUtil.UPDATE_TABLE.format(table=table, update=u_place, where=w_place)
        sql = self.mogrify(sql, u_values + w_values)
        return self.command(sql)

    def delete(self, table, where):
        """Delete the rows matching ``where``.

        :param table: table name.
        :param where: filter dict understood by :meth:`convert_where_data`.
        :return: result of :meth:`command`.
        """
        w_keys, w_place, w_values = self.convert_where_data(where)
        sql = SqlUtil.DELETE_TABLE.format(table=table, where=w_place)
        sql = self.mogrify(sql, w_values)
        return self.command(sql)

    def is_exists_table(self, table, database=None):
        """Tell whether ``table`` exists in ``database``.

        :param table: table name to look for.
        :param database: database to inspect; defaults to ``self.database``.
        :return: True when the table is listed by ``show tables``.
        """
        database = database if database else self.database
        sql = f'show tables from {database};'
        rows = self.command(sql, find=True)
        table_names = [row[f'Tables_in_{database}'] for row in rows]
        return table in table_names

    def callproc(self, procname, args):
        """Execute a stored procedure and commit.

        :param procname: name of the stored procedure.
        :param args: arguments passed to the procedure.
        :return: True after the call; False when the connection check
            fails without raising.
        """
        if not self.check():
            return False
        cur = self.connect.cursor()
        try:
            cur.callproc(procname, args)
        finally:
            cur.close()
        self.connect.commit()
        return True

    @staticmethod
    def convert_insert_data(value_list):
        """Build the pieces of a multi-row INSERT.

        Column order is taken from the first row; every row must provide
        the same keys.

        :param value_list: list of dicts, one per row.
        :return: ``(columns, placeholders, values)`` where the first two are
            comma-joined strings and ``values`` is a list of tuples;
            ``([], [], [])`` for empty input.
        """
        if not value_list:
            return [], [], []
        keys = tuple(value_list[0].keys())
        place = ', '.join(['%s'] * len(keys))
        values = [tuple(item[k] for k in keys) for item in value_list]
        return ', '.join(keys), place, values

    @staticmethod
    def convert_where_data(where, _and=' and '):
        """Translate a filter dict into a parameterised SQL fragment.

        Example: ``{'age': {'$gt': 20}, 'name': 'Jim'}`` becomes
        ``age > %s and name = %s`` with values ``(20, 'Jim')``.

        :param where: dict of column -> value (equality) or column ->
            ``{operator: operand}`` using the keys of ``where_dict``;
            unknown operators fall back to ``=``.
        :param _and: separator placed between the generated clauses.
        :return: ``(columns, fragment, values)``: comma-joined column names,
            the joined clause string, and a tuple of bound values.
        :raises ValueError: when a column maps to an empty operator dict.
        """
        keys = []
        where_keys = []
        where_vals = []
        for column, condition in (where or {}).items():
            if isinstance(condition, dict):
                if not condition:
                    # Dropping the clause silently would widen the match.
                    raise ValueError(f'empty condition for column {column!r}')
                pairs = [(SqlUtil.where_dict.get(op, '='), operand)
                         for op, operand in condition.items()]
            else:
                pairs = [('=', condition)]
            # One clause per operator; spaces keep keyword operators such as
            # IN / LIKE / REGEXP from fusing with the column name.
            for operator, operand in pairs:
                where_keys.append(f'{column} {operator} %s')
                where_vals.append(operand)
            keys.append(column)
        place = _and.join(where_keys)
        values = tuple(where_vals)
        return ', '.join(keys), place, values
6. urls.txt
https://www.cnblogs.com/sui776265233/p/9719463.html