pandas切割mongodb数据

import pandas as pd
import numpy as np
import missingno as mso
from pandas import Series,DataFrame
import pymongo
import pymysql
import time
#连接数据库
def select_mysql():
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='test',
                         charset='utf8')
    cur = db.cursor()
    sql = "select cid from spider_category_task "
    # sql = "select cid from spider_category_task where cid = 110205"#测试
    cur.execute(sql)
    cid_list = cur.fetchall()
    db.close()
    for cid in cid_list:
        cid =cid[0]
        find_pymongo_data(cid)

def find_pymongo_data(category):
    a = 25
    '''
    传入一个类目名称,从mongo里面查出该类目的所有数据,并把
    它转换为pandas里面的DataFrame对象
    :param key_word: 类目名称
    :return:
    '''
    mongo_user = "root"
    mongo_pwd = "123456*"
    client = pymongo.MongoClient(host = 'localhost',port = 27017 )
    #数据库登录需要账号密码的话
    client.test.authenticate(mongo_user,mongo_pwd) # test代表数据库名称
    db = client['test']#获得数据库的句柄
    coll = db['20181022']#获得collection的句柄

    coll.create_index('category')#添加索引
    #加载数据
    # data = DataFrame(list(coll.find({'key_word':key_word})))#根据搜索的关键词拿出数据
    mongo_data = DataFrame(list(coll.find({'category':category})))
    client.close()
    try:
        if len(mongo_data) > 0:#查询结果不为空执行
            mongo_data = mongo_data[['price','sold']]#取里面的价格和销量
            data = mongo_data[~mongo_data['sold'].isin([''])]#取出里面的价格不为空的数据
            print(data)
            data = data.astype({'price': 'float', 'sold': 'int'})#价格转为浮点型.销量转为整数型
            splite_price(data,category,a)
    except Exception as e:
        pass


def splite_price(data,category, a):
    '''
    :param data: 从mongo里面获取DataFrame对象的数据
    :return:
    '''
    try:
        data['group'] = pd.qcut(data.price,a)#价格切割为25段,如果切不了,依次减一,递归继续切
        # 求每个价格分段对应的sales的和
        datas = data[['sold', 'group']].groupby('group').sum().reset_index()
    except Exception as e:
        if a > 0:
            a -= 1
            splite_price(data,category, a)
        else:
            print('一段也切不了')


    # print((data['group']))#二位数组


    price_group =datas['group']
    sold_group = datas['sold']
    price_group_list = []
    for i in price_group:
        i = (str(i)).replace('(','')
        i = i.replace(']','')
        price_list = i.split(',')
        price_group_list.append(price_list)
        # low_price = price_list[0]
        # high_price =price_list[1]
    data_list = zip(price_group_list,sold_group)#打包,两个列表合为一个列表(数据一一对应)
    data_list = (list(data_list))
    print(data_list)
#     insert_mysql(data_list,category)
#
def insert_mysql(data_list,category):
    '''
    插入到数据库里面
    :param data_list: 
    :param category: 
    :return: 
    '''
    # print(data_list)
    pass
    datetime = time.strftime('%Y%m%d')
    # data_list = splite_price()
    db = pymysql.connect(host = 'localhost',port = 3306,user = 'test',password = 'root*',db = 'test',charset = 'utf8')
    cur = db.cursor()
    for data in data_list:
        min_price = data[0][0]
        max_price = data[0][1]
        min_price = float(min_price)
        max_price = float(max_price)
        sold = data[1]
        # print(sold)
        sql = "insert into price_break_test (min_price,max_price,cate_id,sold,date_time) values(%s,%s,'%s','%s','%s')"%(min_price,max_price,category,sold,datetime)
        cur.execute(sql)
    db.commit()
    db.close()


select_mysql()

猜你喜欢

转载自blog.csdn.net/weixin_42341608/article/details/83338839
今日推荐