python系列整理---爬虫架构简单代码实现

爬虫spider流程示意图

下面根据上述流程给出一个简单的爬虫实现。这里只演示基本的架构思路，实际项目中的爬虫要复杂得多，本文不做具体讨论。

1. 目录

2. engine.py

# encoding=utf-8
import os
from spider.scheduler import Scheduler

def read_urls(file_path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    :param file_path: path of the text file holding one URL per line.
    :return: list of stripped lines, in file order, with empty lines dropped.
    """
    # Read-only mode: the file is never written, so 'r+' would only add an
    # unnecessary write-permission requirement.
    with open(file_path, 'r', encoding='utf-8') as fp:
        stripped = (line.strip() for line in fp)
        return [url for url in stripped if url]


def engine():
    """Run one crawl pass: read the URL list, download, parse, then store.

    The URL list is ``urls.txt`` located next to this module.
    """
    # abspath first: when __file__ is a bare relative name (e.g. 'engine.py'),
    # dirname() is '' and plain concatenation would yield '/urls.txt'.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, 'urls.txt')
    urls = read_urls(path)
    htmls = Scheduler.download(urls)
    data = Scheduler.analysis(htmls)
    Scheduler.storage(data)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    engine()

3. scheduler.py

# encoding=utf-8
from spider.downloader import Download
from spider.analysis import Analysis
from spider.storage import Storage

class Scheduler:
    """Dispatches work to the download, parse and storage components."""

    def __init__(self):
        pass

    @staticmethod
    def download(urls):
        """Fetch every URL and pair it with the text Download.get returns.

        :param urls: a list of URLs; any non-list value is treated as one URL.
        :return: list of ``(url, html)`` tuples in input order.
        """
        if not isinstance(urls, list):
            urls = [urls]
        # Download stage
        return [(target, Download.get(target)) for target in urls]

    @staticmethod
    def analysis(_tuple):
        """Parse each downloaded page.

        :param _tuple: iterable shaped like ``[(url, html), (url, html)]``.
        :return: list with one Analysis.parse result per pair, in order.
        """
        # Parse stage
        return [
            Analysis.parse(page_url, page_html)
            for page_url, page_html in _tuple
        ]

    @staticmethod
    def storage(data):
        """Hand every parsed record to Storage.storage, one at a time."""
        # Storage stage
        for record in data:
            Storage.storage(record)

4. downloader.py

# encoding=utf-8
import requests


class Download:
    """HTTP fetch helpers built on ``requests``.

    Topics the original author lists for a real downloader (none of them is
    implemented in this class):
    1. efficient crawling
    2. common counter-measures against anti-crawler defences
    3. data volume: concurrency, distributed crawling
    """

    # Seconds to wait on the server before giving up; without a timeout a
    # stalled server would block the crawl indefinitely.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        pass

    @staticmethod
    def get(url, headers=None, timeout=DEFAULT_TIMEOUT):
        """Send a GET request and return the response body as text.

        :param url: address to fetch.
        :param headers: optional dict of request headers.
        :param timeout: seconds passed to ``requests.get`` as ``timeout``.
        :return: ``response.text``.
        """
        # None instead of a shared mutable ``{}`` default.
        response = requests.get(url, headers=headers or {}, timeout=timeout)
        return response.text

    @staticmethod
    def post(url, data=None, headers=None, timeout=DEFAULT_TIMEOUT):
        """Send a POST request and return the response body as text.

        :param url: address to post to.
        :param data: optional form payload.
        :param headers: optional dict of request headers.
        :param timeout: seconds passed to ``requests.post`` as ``timeout``.
        :return: ``response.text``.
        """
        response = requests.post(
            url, data=data or {}, headers=headers or {}, timeout=timeout
        )
        return response.text

    @staticmethod
    def get_headers(params):
        """Return ``params`` unchanged (placeholder for header construction)."""
        return params

5. storage.py

# encoding=utf-8
import hashlib
import logging

import pymysql


class Storage:
    """Writes parsed page records into the MySQL table named by ``table``."""

    # Name of the table that receives the records.
    table = 'spider'

    def __init__(self):
        pass

    @staticmethod
    def storage(params):
        """Insert ``params`` as a new row, or update the row if it exists.

        The row is looked up by ``id``, the MD5 hex digest of ``params['url']``.

        :param params: dict of column name -> value; must contain ``'url'``.
        :return: None
        :raises KeyError: if ``params`` has no ``'url'`` key.
        """
        # NOTE(review): connection settings are hard-coded here; consider
        # moving them to configuration.
        # NOTE(review): the ``id`` column is only written if ``params`` already
        # carries an ``'id'`` key -- confirm against the producer of ``params``.
        sql_util = SqlUtil('127.0.0.1', 3306, 'root', '123456', 'mysql')
        try:
            _id = Storage.url2md5(url=params['url'])
            if sql_util.exists(Storage.table, _id):
                # SqlUtil.update takes (table, update, where); the previous
                # ``dict_value=`` keyword does not exist on it.
                sql_util.update(Storage.table, update=params, where={'id': _id})
            else:
                # insert() converts a list of row dicts, so wrap the one row.
                sql_util.insert(Storage.table, [params])
        finally:
            # One connection is opened per call; always release it.
            sql_util.close()

    @staticmethod
    def url2md5(url):
        """Return the MD5 hex digest of ``url``.

        :param url: ``str`` (encoded as UTF-8 first) or bytes-like value.
        :return: 32-character lowercase hexadecimal string.
        """
        if isinstance(url, str):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()


class SqlUtil:
    """Small helper that builds and runs simple SQL on one PyMySQL connection.

    The connection is opened in ``__init__``. Table and column names are
    formatted into the SQL text directly (identifiers cannot be bound as
    parameters), so they must come from trusted code, never from user input.
    Values are always bound through ``%s`` placeholders.
    """

    # Maps the ``$op`` keys accepted inside ``where`` dicts to SQL operators.
    where_dict = {
        '$in': 'IN', '$like': 'LIKE',
        '$eq': '=', '$reg': 'REGEXP',
        '$ge': '>=', '$le': '<=',
        '$gt': '>', '$lt': '<'
    }

    # SQL templates filled in with str.format().
    CREATE_DATABASE = 'CREATE DATABASE IF NOT EXISTS {database}'
    CREATE_TABLE = 'CREATE TABLE {table} ({keys}, UNIQUE INDEX({index}))'
    CREATE_INDEX = 'CREATE INDEX {index} ON {table}'
    INSERT_TABLE = 'INSERT INTO {table} ({keys}) VALUES ({values})'
    DELETE_TABLE = 'DELETE FROM {table} WHERE {where}'
    UPDATE_TABLE = 'UPDATE {table} SET {update} WHERE {where}'
    SELECT_TABLE = 'SELECT {keys} FROM {table} WHERE {where}'
    SELECT_TABLE_ALL = 'SELECT * FROM {table} WHERE {where}'

    # Not referenced by any method of this class.
    MAX_RETRY_TIMES = 3

    def __init__(self, ip, port, user, password, database, charset='utf-8'):
        """Store the connection settings and open the connection.

        :param ip: database host.
        :param port: database port.
        :param user: user name.
        :param password: user password.
        :param database: database (schema) name.
        :param charset: stored on the instance only; it is not passed to
            ``pymysql.connect`` by this class.
        """
        self.ip = ip
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset
        self.connect = None
        self.cursor = None
        self.connected = False
        self.connection()

    def check(self):
        """Return True if the connection answers a ping, reconnecting if not.

        If the ping fails, the connection is closed and reopened; an error
        raised while reopening propagates to the caller.
        """
        try:
            self.connect.ping()
            return True
        except Exception:
            print('Server not available!')
            self.close()
            return self.connection()

    def connection(self):
        """Open the connection and a default cursor, then ping the server.

        :return: True. Nothing is caught here, so any exception raised by
            ``pymysql.connect`` or ``ping`` propagates.

        Troubleshooting note kept from the original author, for the error
        "client does not support authentication":
            mysql> alter user 'root'@'localhost' identified with mysql_native_password by '123456';
            mysql> flush privileges;
        """
        self.connect = pymysql.connect(host=self.ip, port=self.port,
                                       user=self.user, passwd=self.password,
                                       db=self.database)
        self.cursor = self.connect.cursor()
        self.connect.ping()
        self.connected = True
        return True

    def close(self):
        """Close the connection; errors while closing are ignored."""
        try:
            if self.connect is not None:
                self.connect.close()
        except Exception:
            # Best effort: the link may already be closed or broken.
            pass
        finally:
            self.connected = False

    def mogrify(self, sql, data=None):
        """Return ``sql`` with ``data`` escaped and substituted by the cursor.

        Used to bind values safely instead of formatting them into the text.
        """
        return self.cursor.mogrify(sql, data)

    def command(self, sql, data=None, find=False, _dict=True, _log=True):
        """Execute ``sql`` and commit.

        :param sql: statement text.
        :param data: optional sequence of parameter tuples; when non-empty the
            statement is run with ``executemany``.
        :param find: when True, return the fetched rows instead of True.
        :param _dict: when True, use ``pymysql.cursors.DictCursor``.
        :param _log: when True, log the statement at DEBUG level.
        :return: fetched rows if ``find`` else True; None if the connection
            check reports failure.
        """
        if _log:
            logging.getLogger(__name__).debug(sql)
        if not self.check():
            return None
        if _dict:
            cur = self.connect.cursor(pymysql.cursors.DictCursor)
        else:
            cur = self.connect.cursor()
        try:
            if data:
                cur.executemany(sql, data)
            else:
                cur.execute(sql)
            results = cur.fetchall()
        finally:
            # Release the cursor even when execution raises.
            cur.close()
        self.connect.commit()
        return results if find else True

    def insert(self, table, value_list):
        """Insert one row dict, or a list/tuple of row dicts, into ``table``.

        All rows must have the keys of the first row.

        :return: 0 when ``value_list`` is empty, otherwise the result of
            :meth:`command`.
        """
        if not value_list:
            return 0
        # Wrap a single row so the converter always receives a sequence.
        if isinstance(value_list, (list, tuple)):
            rows = value_list
        else:
            rows = [value_list]
        keys, place, values = self.convert_insert_data(rows)
        # Values stay as %s placeholders here; executemany() binds them.
        sql = SqlUtil.INSERT_TABLE.format(table=table, keys=keys, values=place)
        return self.command(sql, data=values)

    def find(self, table, where=None, keys=None, multi=True):
        """Select rows from ``table``.

        :param where: filter dict as accepted by :meth:`convert_where_data`;
            empty or None selects every row.
        :param keys: column name, or list of column names, to return; empty or
            None selects ``*``.
        :param multi: when False, at most one row is returned (``limit 1``).
        :return: the rows fetched by :meth:`command`.
        """
        w_keys, w_place, w_values = self.convert_where_data(where or {})
        # An empty filter would leave a dangling "WHERE"; match everything.
        w_place = w_place or '1=1'
        if keys:
            columns = keys if isinstance(keys, str) else ', '.join(keys)
            sql = SqlUtil.SELECT_TABLE.format(table=table, keys=columns, where=w_place)
        else:
            sql = SqlUtil.SELECT_TABLE_ALL.format(table=table, where=w_place)
        if not multi:
            sql += ' limit 1'
        # Bind the filter values in both branches; None skips substitution.
        sql = self.mogrify(sql, w_values or None)
        return self.command(sql, find=True)

    def exists(self, table, _id):
        """Return the row(s) of ``table`` whose ``id`` equals ``_id``.

        The result is truthy when a matching row exists.
        """
        return self.find(table=table, where={'id': _id}, multi=False)

    def update(self, table, update, where):
        """Set the columns in ``update`` on the rows of ``table`` matching ``where``."""
        # SET assignments are comma-separated, not joined with "and".
        u_keys, u_place, u_values = self.convert_where_data(update, _and=', ')
        w_keys, w_place, w_values = self.convert_where_data(where)
        sql = SqlUtil.UPDATE_TABLE.format(table=table, update=u_place, where=w_place)
        sql = self.mogrify(sql, u_values + w_values)
        return self.command(sql)

    def delete(self, table, where):
        """Delete the rows of ``table`` matching ``where``."""
        w_keys, w_place, w_values = self.convert_where_data(where)
        sql = SqlUtil.DELETE_TABLE.format(table=table, where=w_place)
        sql = self.mogrify(sql, w_values)
        return self.command(sql)

    def is_exists_table(self, table, database=None):
        """Return True if ``table`` is listed by ``show tables`` for the database.

        :param database: database to inspect; defaults to ``self.database``.
        """
        database = database if database else self.database
        sql = f'show tables from {database};'
        rows = self.command(sql, find=True)
        table_names = [row[f'Tables_in_{database}'] for row in rows]
        return table in table_names

    def callproc(self, procname, args):
        """Call the stored procedure ``procname`` with ``args`` and commit.

        :param procname: stored procedure name.
        :param args: stored procedure arguments.
        :return: None
        """
        if self.check():
            cur = self.connect.cursor()
            try:
                cur.callproc(procname, args)
            finally:
                cur.close()
            self.connect.commit()

    @staticmethod
    def convert_insert_data(value_list):
        """Convert a sequence of row dicts into INSERT building blocks.

        :return: ``(keys, place, values)`` where ``keys`` is the comma-joined
            column list taken from the first row, ``place`` the matching
            comma-joined ``%s`` placeholders and ``values`` a list of tuples,
            one per row; ``([], [], [])`` for empty input.
        :raises KeyError: if a later row lacks a key of the first row.
        """
        if not value_list:
            return [], [], []
        keys = tuple(value_list[0].keys())
        place = ['%s'] * len(keys)
        values = [tuple(item[k] for k in keys) for item in value_list]
        return ', '.join(keys), ', '.join(place), values

    @staticmethod
    def convert_where_data(where, _and=' and '):
        """Convert a filter dict into a placeholder clause and its values.

        Example::

            where={'age': {'$gt': 20}, 'name': 'Jim'}
            -> 'age > %s and name = %s', (20, 'Jim')

        A plain value means equality. A non-empty dict value maps operator
        keys from ``where_dict`` to operands and yields one condition per
        operator; unknown operator keys fall back to ``=``.

        :param where: dict of column -> value or column -> {operator: value}.
        :param _and: separator placed between the generated conditions.
        :return: ``(keys, place, values)``: comma-joined column names, the
            joined conditions, and the tuple of values in placeholder order.
        """
        keys = []
        clauses = []
        values = []
        for column, spec in where.items():
            if isinstance(spec, dict) and spec:
                conditions = [
                    (SqlUtil.where_dict.get(op, '='), operand)
                    for op, operand in spec.items()
                ]
            else:
                conditions = [('=', spec)]
            for operator, operand in conditions:
                # Spaces are required for keyword operators (LIKE, IN, REGEXP).
                clauses.append(f'{column} {operator} %s')
                values.append(operand)
            keys.append(column)
        return ', '.join(keys), _and.join(clauses), tuple(values)

6. urls.txt

https://www.cnblogs.com/sui776265233/p/9719463.html

python系列整理---爬虫架构简单代码实现

爬虫spider流程示意图

1. 目录

2. engine.py

3. scheduler.py

4. downloader.py

5. storage.py

6. urls.txt

猜你喜欢