scrapy 万能插入数据库 sql实现

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_37049050/article/details/84313620

在 spider 中创建 Item 以及对应的操作:row 中的字段对应数据库表中的字段,table 为表名。在爬虫启动的时候初始化数据库连接,这里用了 Scrapy 的信号机制(signals.spider_opened),不了解的可以查阅 Scrapy 官方文档。

   class UniversalRow(Item):
    """Generic item carrying one database row plus the name of its target table."""
    # Mapping of column name -> value; its keys are used as the column list
    # when the row is inserted.
    row = Field()
    # Name of the database table the row is inserted into.
    table = Field()

class BDMonitor(Spider):
    """Spider that opens a database connection on start-up and builds row items.

    The connection is stored on the spider as ``data_conn`` when the
    ``spider_opened`` signal fires, so that item pipelines can reuse it.
    """

    name = "bd"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook ``spider_opened`` to the matching signal.

        Returns:
            The newly created spider instance.
        """
        spider = super(BDMonitor, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        # The factory must hand the instance back; without this return the
        # caller receives None instead of a spider.
        return spider

    def spider_opened(self, spider):
        """Signal handler: open the data database connection.

        Args:
            spider: The spider that was opened (unused here).
        """
        # NOTE(review): `settings` and `MySQLConnection` are defined outside
        # this snippet -- confirm they are imported where this class lives.
        self.data_conn = MySQLConnection(settings['DATA_DB']).get_conn()

    def compose_item(self, table, item_tuple):
        """Build a ``UniversalRow`` for ``table`` with the bookkeeping columns.

        Args:
            table: Name of the destination table.
            item_tuple: Source values identifying the record; they are hashed
                into the ``hash_label`` column.

        Returns:
            A ``UniversalRow`` whose ``row`` holds ``hash_label``,
            ``crawl_date`` (local date as the integer YYYYMMDD), ``source``
            and ``version``.
        """
        # NOTE(review): the original hashed an undefined `item_string`.
        # Presumably it was derived from `item_tuple`; a plain string is used
        # as-is, any other iterable is joined with '|' -- confirm against the
        # hashes already stored in the database.
        if isinstance(item_tuple, str):
            item_string = item_tuple
        else:
            item_string = '|'.join(str(part) for part in item_tuple)

        item = UniversalRow()
        item['table'] = table
        # Assign the whole dict at once: a fresh item has no 'row' value yet,
        # so indexing item['row'][...] first would raise KeyError.
        item['row'] = {
            'hash_label': hashlib.md5(item_string.encode('utf8')).hexdigest(),
            'crawl_date': int(time.strftime('%Y%m%d', time.localtime())),
            'source': 'bd',
            # NOTE(review): `self.version` is not set anywhere in this
            # snippet -- it must be initialised before this method is called.
            'version': self.version + 1,
        }
        return item

接下来是pipelines 中的代码


class LoadDBPipeline(object):
    """Item pipeline that inserts each item's row into its target table."""

    @staticmethod
    def _rollback(conn):
        """Best-effort rollback so a dead connection cannot mask the real error."""
        try:
            conn.rollback()
        except pymysql.Error as e:
            logging.warning("Rollback failed:%s", e)

    def process_item(self, item, spider):
        """Insert ``item['row']`` into ``item['table']`` and pass the item on.

        Database warnings and errors are logged rather than raised, so one
        failed insert does not stop the crawl.

        Args:
            item: Item exposing ``'table'`` and ``'row'`` keys.
            spider: Spider that owns the shared connection (``data_conn``).

        Returns:
            The unmodified ``item``.
        """
        conn = spider.data_conn
        self.conn = conn
        self.dbsession = db(conn)

        try:
            # Re-establish the connection if the server dropped it.
            conn.ping(reconnect=True)
            self.dbsession.Insert(item['table'], item['row'])
            # Commit explicitly instead of relying on `with conn:`: depending
            # on the PyMySQL version, leaving that block either commits or
            # closes the connection without committing.
            conn.commit()
        except pymysql.Warning as w:
            self._rollback(conn)
            logging.warning("Insert Warning:%s", w)
        except pymysql.Error as e:
            self._rollback(conn)
            logging.error("Insert Error:%s", e)

        return item

settings中

# Enable the database-loading pipeline. The integer is the pipeline's order
# value: Scrapy runs enabled pipelines from the lowest value to the highest.
ITEM_PIPELINES = {"jihuashu.pipelines.LoadDBPipeline": 10}

最后就是插入数据库代码

class db:
    """Small insert helper bound to a single database connection.

    The connection object must provide ``escape_string``, ``query`` and
    ``insert_id``.
    """

    def __init__(self, conn):
        """Store the connection used for all subsequent statements."""
        self.conn = conn

    @staticmethod
    def _quote_identifier(name):
        """Backtick-quote a table/column name, doubling embedded backticks."""
        return '`' + str(name).replace('`', '``') + '`'

    def _to_sql_literal(self, value):
        """Render one value as an escaped SQL literal.

        ``None`` becomes ``NULL``; lists are joined with ``'|'``; dicts are
        serialised as JSON; everything else goes through ``str``.
        """
        if value is None:
            # Store a real NULL instead of the literal string 'None'.
            return 'NULL'
        if isinstance(value, list):
            # str() each element so lists of numbers do not break the join.
            text = '|'.join(str(element) for element in value)
        elif isinstance(value, dict):
            text = json.dumps(value, ensure_ascii=False)
        else:
            text = str(value)
        return "'" + self.conn.escape_string(text) + "'"

    def Insert(self, table, data):
        """Insert one row into ``table``.

        Args:
            table: Name of the destination table.
            data: Mapping of column name -> value.

        Returns:
            Whatever the connection's ``insert_id()`` reports after the
            statement has run.
        """
        columns = ','.join(self._quote_identifier(column) for column in data)
        values = ','.join(self._to_sql_literal(value) for value in data.values())

        qs = "INSERT INTO %s (%s) VALUES (%s)" % (
            self._quote_identifier(table), columns, values)
        self.conn.query(qs)

        return self.conn.insert_id()

初始化时传入数据库连接对象;Insert 的参数中,table 为表名,data 为以列名为键、列值为值的数据字典。

猜你喜欢

转载自blog.csdn.net/qq_37049050/article/details/84313620