from pyalink. alink import *
Use one of the following commands to start using PyAlink:
- useLocalEnv(parallelism, flinkHome=None, config=None)
- useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.
# Start a local PyAlink environment with parallelism 2; flinkHome and config
# are passed explicitly as None.
useLocalEnv(2, flinkHome=None, config=None)
JVM listening on 127.0.0.1:38737
MLEnv(benv=JavaObject id=o2, btenv=JavaObject id=o5, senv=JavaObject id=o3, stenv=JavaObject id=o6)
# Batch source over the remote iris CSV; the schema string declares each
# column as a "name type" pair.
source = (
    CsvSourceBatchOp()
    .setSchemaStr(
        "sepal_length double, sepal_width double, petal_length double, petal_width double, category string"
    )
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv")
)

# Project down to the two sepal columns, collect the result locally and print it.
res = source.select(["sepal_length", "sepal_width"])
df = res.collectToDataframe()
print(df)
sepal_length sepal_width
0 5.0 3.2
1 6.6 3.0
2 5.4 3.9
3 5.0 2.3
4 5.5 3.5
.. ... ...
145 6.4 2.9
146 6.3 2.5
147 5.8 2.6
148 5.7 4.4
149 6.5 3.0
[150 rows x 2 columns]
# Notebook cell expression: show only the first row of the collected frame.
df.head(1)
   sepal_length  sepal_width
0           5.0          3.2
from pyalink. alink import *
import sys, os
# Discard the environment created earlier, then start a fresh local one
# with parallelism 2.
resetEnv()
useLocalEnv(2)
MLEnv(benv=JavaObject id=o178, btenv=JavaObject id=o181, senv=JavaObject id=o179, stenv=JavaObject id=o182)
# Column schema ("name type" pairs, comma separated) shared by the Avazu CSV
# sources below. Adjacent string literals are concatenated at compile time, so
# the value is a single line with no embedded newlines.
schemaStr = (
    "id string, click string, dt string, C1 string, banner_pos int , site_id string, "
    "site_domain string, site_category string, app_id string, app_domain string, "
    "app_category string, device_id string, device_ip string, device_model string, "
    "device_type string, device_conn_type string, C14 int , C15 int , C16 int , C17 int , "
    "C18 int , C19 int , C20 int , C21 int "
)

# Location of the small Avazu sample used for batch training.
batchTrainDataFn = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv"
# Batch source over the small Avazu sample, using the shared schema and
# skipping the first line of the CSV.
trainBatchData = (
    CsvSourceBatchOp()
    .setFilePath(batchTrainDataFn)
    .setSchemaStr(schemaStr)
    .setIgnoreFirstLine(True)
)

# Bare expression: in a notebook this echoes the operator's repr.
trainBatchData
<pyalink.alink.batch.common.batch_op_1.CsvSourceBatchOp at 0x7f3bf08abd10>
# Name of the label column and of the feature-vector column written by the
# feature pipeline.
labelColName = "click"
vecColName = "vec"

# Value passed to FeatureHasher.setNumFeatures and FtrlTrainStreamOp.setVectorSize.
nameHashFeatures = 30000

# Every column fed into the feature hasher.
selectedColNames = [
    "C1", "banner_pos", "site_category", "app_domain",
    "app_category", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
    "site_id", "site_domain", "device_id", "device_model",
]

# Subset of the selected columns that is treated as categorical.
categoryColNames = [
    "C1", "banner_pos", "site_category", "app_domain",
    "app_category", "device_type", "device_conn_type",
    "site_id", "site_domain", "device_id", "device_model",
]

# Subset of the selected columns that is treated as numerical (and scaled).
# The original line used typographic quotes, which is a syntax error in
# Python; plain ASCII quotes are required for string literals.
numericalColNames = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]
# Location of the full Avazu training file consumed as a stream.
wholeDataFile = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-ctr-train-8M.csv"

# Stream source over the full file, using the shared schema and skipping the
# first line of the CSV.
data = (
    CsvSourceStreamOp()
    .setFilePath(wholeDataFile)
    .setSchemaStr(schemaStr)
    .setIgnoreFirstLine(True)
)

# Split the stream with fraction 0.5. The splitter's own output is used as the
# training stream and its side output 0 as the test stream.
# NOTE(review): presumably the fraction is the share routed to the main
# output - confirm against the SplitStreamOp documentation.
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data)
# These two assignments were fused onto a single line in the original, which
# does not parse; they must be separate statements.
train_stream_data = spliter
test_stream_data = spliter.getSideOutput(0)
# Two-stage feature pipeline: scale the numerical columns, then hash the
# selected columns into a single vector column of nameHashFeatures entries.
feature_pipeline = (
    Pipeline()
    .add(StandardScaler().setSelectedCols(numericalColNames))
    .add(
        FeatureHasher()
        .setSelectedCols(selectedColNames)
        .setCategoricalCols(categoryColNames)
        .setOutputCol(vecColName)
        .setNumFeatures(nameHashFeatures)
    )
)

# Fit on the batch sample and save the fitted model under the current
# working directory. BatchOperator.execute() is called before the model is
# loaded back from the same path.
FEATURE_PIPELINE_MODEL_FILE = os.path.join(os.getcwd(), "feature_pipe_model.csv")
feature_pipeline.fit(trainBatchData).save(FEATURE_PIPELINE_MODEL_FILE)
BatchOperator.execute()

# Reload the fitted pipeline from disk for use on batch and stream data.
feature_pipelineModel = PipelineModel.load(FEATURE_PIPELINE_MODEL_FILE)
# Train a logistic-regression model on the transformed batch sample; it is
# passed to the FTRL operators below as their initial model.
lr = LogisticRegressionTrainBatchOp()
initModel = (
    lr.setVectorCol(vecColName)
    .setLabelCol(labelColName)
    .setWithIntercept(True)
    .setMaxIter(10)
    .linkFrom(feature_pipelineModel.transform(trainBatchData))
)
# Online FTRL training, seeded with the batch-trained model and fed with the
# transformed training stream.
model = (
    FtrlTrainStreamOp(initModel)
    .setVectorCol(vecColName)
    .setLabelCol(labelColName)
    .setWithIntercept(True)
    .setAlpha(0.1)
    .setBeta(0.1)
    .setL1(0.01)
    .setL2(0.01)
    .setTimeInterval(10)
    .setVectorSize(nameHashFeatures)
    .linkFrom(feature_pipelineModel.transform(train_stream_data))
)
# Online prediction: linked to the FTRL model stream and the transformed test
# stream. Keeps the label column and adds "pred" and "details" columns.
predResult = (
    FtrlPredictStreamOp(initModel)
    .setVectorCol(vecColName)
    .setPredictionCol("pred")
    .setReservedCols([labelColName])
    .setPredictionDetailCol("details")
    .linkFrom(model, feature_pipelineModel.transform(test_stream_data))
)

# Register a print sink for the prediction stream.
predResult.print(key="predResult", refreshInterval=30, maxLimit=20)
'DataStream predResult : ( Updated on 2020-04-02 16:10:50, #items received: 196530 )'
    click  pred  details
0   0      0     {"0":"0.893231946299782","1":"0.10676805370021...
1   1      0     {"0":"0.745142214488233","1":"0.25485778551176...
2   1      0     {"0":"0.765632769073235","1":"0.23436723092676...
3   0      0     {"0":"0.8113419833506623","1":"0.1886580166493...
4   0      0     {"0":"0.929177522686554","1":"0.07082247731344...
5   1      0     {"0":"0.7859656548828632","1":"0.2140343451171...
6   0      0     {"0":"0.8559101947601475","1":"0.1440898052398...
7   0      0     {"0":"0.9007309902743751","1":"0.0992690097256...
8   0      0     {"0":"0.7747776539114233","1":"0.2252223460885...
9   0      0     {"0":"0.7113793792746559","1":"0.2886206207253...
10  0      0     {"0":"0.8067465417336181","1":"0.1932534582663...
11  0      0     {"0":"0.9386237136980374","1":"0.0613762863019...
12  0      0     {"0":"0.9188682855816503","1":"0.0811317144183...
13  0      0     {"0":"0.6924579330847471","1":"0.3075420669152...
14  0      0     {"0":"0.7393514879088229","1":"0.2606485120911...
15  1      0     {"0":"0.8084501207999263","1":"0.1915498792000...
16  0      0     {"0":"0.949887889053032","1":"0.05011211094696...
17  0      0     {"0":"0.7547176580812045","1":"0.2452823419187...
18  0      0     {"0":"0.5494833642638153","1":"0.4505166357361...
19  0      0     {"0":"0.9240806476231835","1":"0.0759193523768...
# Evaluate the prediction stream as a binary classifier, then extract the
# Accuracy, AUC and ConfusionMatrix entries from the JSON "Data" column
# (keeping "Statistics") and register a print sink for the result.
(
    EvalBinaryClassStreamOp()
    .setLabelCol(labelColName)
    .setPredictionCol("pred")
    .setPredictionDetailCol("details")
    .setTimeInterval(10)
    .linkFrom(predResult)
    .link(
        JsonValueStreamOp()
        .setSelectedCol("Data")
        .setReservedCols(["Statistics"])
        .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])
        .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
    )
    .print(key="evaluation", refreshInterval=30, maxLimit=20)
)

# Launch the streaming job built from the operators defined above.
StreamOperator.execute()
'DataStream evaluation : ( Updated on 2020-04-02 16:11:00, #items received: 93 )'
   Statistics  Accuracy            AUC                 ConfusionMatrix
0  all         0.8208864412861355  0.7018136340935746  [[1740,1715],[32728,156114]]
1  window      0.8257520709199244  0.7048161223985424  [[37,53],[1146,5645]]
2  window      0.8118575185273726  0.7082768044875817  [[18,27],[760,3378]]
3  all         0.8206942182410424  0.7019777696316556  [[1758,1742],[33488,159492]]
4  window      0.8255558270217445  0.7054619366258229  [[21,13],[701,3358]]