为了检验数据清洗工作的有效性,写了一个用来产生测试数据的函数。

import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime 
from scipy import interpolate
from pandas import DataFrame,Series
def test_data_gen(num_pi, num_ex, num_gap, num_bk, save_path='data_test.xlsx',
                  show=True, start_dt=None, interval=None):
    """Generate a noisy sine series with injected defects for data-cleaning tests.

    The clean signal is ``sin(x) + 1`` sampled at 72 points per pi over
    ``[0, num_pi * pi]``, plus uniform noise in ``[-0.1, 0.1)``.  Outliers,
    missing segments and isolated missing points are then injected at random
    positions drawn from the global ``np.random`` state (seed it beforehand
    for reproducible output).

    Args:
        num_pi: Number of pi-lengths of sine data to generate; must be > 0.
        num_ex: Number of outlier positions.  Each drawn position is
            multiplied by 1.8; a position drawn twice is scaled twice.
        num_gap: Number of missing segments.  One segment length in 5..20 is
            drawn and shared by all segments; segments may overlap.
        num_bk: Number of isolated missing points (positions may repeat).
        save_path: Excel file the series is written to, indexed by timestamp.
            Pass ``None`` to skip writing.
        show: Whether to plot the series and call ``plt.show()``.
        start_dt: Timestamp of the first sample; defaults to 2017-01-01 00:00.
        interval: Spacing between samples; defaults to 5 minutes.

    Returns:
        A 1-D float ``numpy.ndarray`` of length ``72 * num_pi`` with missing
        values stored as NaN, or ``None`` (after printing a message) when
        ``num_pi`` is not positive.
    """
    if num_pi <= 0:
        print("Please input valid num_pi")
        return

    num_point = 72 * num_pi
    # Use the real pi so the series spans exact multiples of pi.
    x = np.linspace(0, math.pi * num_pi, num_point)
    noise = 0.2 * (np.random.rand(num_point) - 0.5)
    signal1 = np.sin(x) + 1 + noise

    if num_ex > 0:
        # Scalar draws in a loop: a repeated position is deliberately scaled
        # again, which a single fancy-indexed assignment would not do.
        for pos in [np.random.randint(0, num_point) for _ in range(num_ex)]:
            signal1[pos] *= 1.8

    if num_gap > 0:
        # Segment length 5..20 inclusive (randint's upper bound is exclusive).
        gap_len = np.random.randint(5, 21)
        # Starts stop 20 short of the end so the longest segment still fits.
        starts = [np.random.randint(num_point - 20) for _ in range(num_gap)]
        for start in starts:
            signal1[start:start + gap_len] = np.nan

    if num_bk > 0:
        for pos in [np.random.randint(num_point) for _ in range(num_bk)]:
            signal1[pos] = np.nan

    if save_path is not None:
        if start_dt is None:
            start_dt = datetime.datetime(2017, 1, 1)
        if interval is None:
            interval = datetime.timedelta(seconds=300)
        timestamps = [start_dt + interval * i for i in range(num_point)]
        DataFrame(signal1, index=timestamps).to_excel(save_path)

    if show:
        plt.figure(figsize=(10, 5))
        plt.plot(signal1)
        plt.show()

    return signal1


# The name starts with "test_", so pytest would try to collect this data
# generator as a test case when it is imported into a test module; opt out.
test_data_gen.__test__ = False
# Demo run: 4*pi of sine data with 2 outliers, 6 missing segments and no isolated missing points.
test_data_gen(4,2,6,0)

(图:上面调用 test_data_gen(4,2,6,0) 后绘制出的测试数据曲线)

array([  9.16153402e-01,   9.54379407e-01,   1.09731508e+00,
         1.08006151e+00,   1.23044901e+00,   1.27691499e+00,
         1.32374816e+00,   1.27310324e+00,   1.28585976e+00,
         1.30158311e+00,   1.41862637e+00,   1.51293483e+00,
         1.46542362e+00,   1.50866021e+00,   1.57576178e+00,
         1.59855783e+00,   1.65355219e+00,   1.69197995e+00,
         1.68356642e+00,   1.79228898e+00,   1.70167661e+00,
         1.74825474e+00,   1.89430181e+00,   1.77041511e+00,
         1.80329786e+00,   1.85901009e+00,   1.87932731e+00,
         1.94802512e+00,   1.98010691e+00,   1.99723195e+00,
         2.04331069e+00,   2.00862185e+00,   1.91811615e+00,
         1.94157410e+00,   2.04888692e+00,   2.05231327e+00,
         1.95910786e+00,   2.08828786e+00,   2.06663645e+00,
         3.43317550e+00,   2.00197626e+00,   1.97676869e+00,
         1.86914580e+00,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,   1.70425963e+00,   1.76188529e+00,
         1.76557559e+00,   1.63184934e+00,   1.66148539e+00,
         1.62250085e+00,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,   1.14210725e+00,   1.23029710e+00,
         1.22109954e+00,   1.15263675e+00,   1.09320005e+00,
         9.56507128e-01,   1.02984472e+00,   8.34937581e-01,
         7.90033074e-01,   7.35435567e-01,   7.10364653e-01,
         7.39270870e-01,   7.00604767e-01,   5.86747134e-01,
         6.97862181e-01,   6.63542957e-01,   5.81515513e-01,
         5.80450727e-01,   3.85541118e-01,   4.44475765e-01,
         2.88093921e-01,   3.69923346e-01,   2.70215771e-01,
         2.05105006e-01,   1.85783290e-01,   2.46754791e-01,
         2.53761177e-01,   8.71627632e-02,   1.77697662e-01,
         6.02048174e-02,   8.31957566e-02,   3.99724990e-02,
         3.28240462e-03,   6.16072322e-02,   1.16086419e-02,
         3.27460625e-03,   1.19376608e-01,  -1.40544492e-02,
         3.83701427e-04,   7.86154263e-02,  -2.98355455e-02,
         6.38220160e-02,   9.06618096e-02,  -3.62089132e-02,
        -5.62450271e-02,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,   2.17241333e-01,   2.06728630e-01,
         2.17462397e-01,   1.59507418e-01,   2.78363880e-01,
         2.73258695e-01,   3.06463501e-01,   3.94220579e-01,
         5.00902489e-01,   3.91612197e-01,   4.55070436e-01,
         4.39161563e-01,   6.00936734e-01,   5.51967858e-01,
         5.97536536e-01,   6.62903418e-01,   7.76501626e-01,
         8.55124750e-01,   8.89612156e-01,   7.81178853e-01,
         9.77184582e-01,   9.16382328e-01,   9.82456695e-01,
         9.20410834e-01,   9.64181887e-01,   1.06073301e+00,
         1.15095579e+00,   1.18814609e+00,   1.21158390e+00,
         1.24563322e+00,   1.27416712e+00,   1.29402865e+00,
         1.34641975e+00,   1.43252962e+00,   1.51013044e+00,
         1.61631219e+00,   1.60735152e+00,   1.53355385e+00,
         1.66311479e+00,   1.68018342e+00,   1.65007061e+00,
         1.78013818e+00,   1.79670488e+00,   1.81007076e+00,
         1.74685062e+00,   1.82627683e+00,   1.93129591e+00,
         1.86592967e+00,   1.87453634e+00,   1.91195971e+00,
         1.86368228e+00,   1.99174036e+00,   1.99325218e+00,
         1.90830361e+00,   2.04982831e+00,   1.96508067e+00,
         2.07271133e+00,   1.89832375e+00,   2.07355193e+00,
         1.96777179e+00,   1.98768210e+00,   2.01406509e+00,
         2.04982411e+00,   2.00725271e+00,   1.89402920e+00,
         2.04896310e+00,   1.92510920e+00,   1.97991570e+00,
         1.91984596e+00,   1.88616008e+00,   1.82408361e+00,
         1.80510005e+00,   1.92727163e+00,   1.79424571e+00,
         1.69321125e+00,   1.79971060e+00,   1.77475016e+00,
         1.68809736e+00,   1.60391652e+00,   1.54975865e+00,
         1.56552817e+00,   1.48431861e+00,   1.52390521e+00,
         1.43952244e+00,   1.45504009e+00,   1.45715980e+00,
         1.26044021e+00,   1.34336588e+00,   1.24744027e+00,
         1.16712633e+00,   1.22905592e+00,   1.06003829e+00,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,   4.26128437e-01,
         5.33140231e-01,   4.59125201e-01,   4.97971158e-01,
         4.41713135e-01,   4.03272371e-01,   2.87368789e-01,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
        -1.70563867e-02,   1.42980337e-01,   6.21597065e-02,
         1.73589890e-02,  -1.82530766e-02,   2.95011364e-02,
         7.15305927e-02,  -1.49406228e-02,  -9.65255582e-02,
         1.08531246e-02,   3.59284949e-02,   7.22006398e-02,
         1.02626912e-01,   5.39881980e-02,   3.51501113e-02,
         9.47410685e-02,   1.21593352e-01,   6.33962286e-02,
         1.07925225e-01,   8.90447881e-02,   7.69046578e-02,
         5.99909834e-02,   7.59334045e-02,   2.12289125e-01,
         3.21005203e-01,   1.82030964e-01,   1.91950850e-01,
         2.54582516e-01,   4.04332967e-01,   4.36101435e-01,
         4.91973908e-01,   5.14079045e-01,   5.44343191e-01,
         4.69244104e-01,   4.80449321e-01,   6.97150590e-01,
         5.97848906e-01,   7.75669998e-01,   7.21173591e-01,
         7.98999899e-01,   8.45761521e-01,   9.00988314e-01,
         8.45971092e-01,   9.01918952e-01,   1.02322215e+00])

'''
若要产生不同起始时间的序列,在日期后面加上自己想设定的时、分、秒:
start_dt = datetime.datetime(2017, 1, 1, hour, minute, second)
若要模拟不同采样时间间隔的序列,修改 timedelta 的值即可:
interval = datetime.timedelta(seconds=300)
'''

猜你喜欢

转载自blog.csdn.net/elite666/article/details/80628410