大家好!本文演示如何在 Spark 中调用 MLlib 的逻辑回归。这部分涉及机器学习算法,理解起来有一定难度,下面结合代码逐步说明。
package Traffic
import java.io.PrintWriter
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by Administrator on 2017/10/18.
*/
// Empty companion class for the Recommder object below; declared only so the
// object has a companion pair. It carries no state or behavior.
class Recommder {
}
/**
 * Trains a logistic-regression model (MLlib, SGD) on sparse one-hot feature data
 * and writes each feature's learned weight to a text file.
 *
 * Input format (one sample per line, tab-separated):
 *   <label>\t<feature1>:<v>;<feature2>:<v>;...
 * where label is "-1" or "1" and every listed feature is treated as value 1.0.
 */
object Recommder {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Recommder").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // Split each line on TAB into (label, feature string).
      val lines = sc.textFile("D://hbase//day27//000001_1").map(_.split("\t"))
      // Collect the distinct feature names appearing anywhere in the data set.
      val featureNames = lines.flatMap(_(1).split(";")).map(_.split(":")(0)).distinct()
      // Dictionary: feature name -> column index. Collected to the driver as a Map
      // so each sample can look up the non-zero slots of its sparse vector.
      val dict = featureNames.zipWithIndex().collectAsMap()
      // Build the training set of LabeledPoints.
      val trainData = lines.map(fields => {
        // MLlib logistic regression only accepts labels 0.0 / 1.0;
        // the input file encodes them as -1 / 1.
        val label = fields(0) match {
          case "-1"  => 0.0
          case "1"   => 1.0
          case other => throw new IllegalArgumentException(s"unexpected label: $other")
        }
        // Column indices of this sample's features. Unknown features are skipped
        // (the original code mapped them to 0, silently colliding with the feature
        // at column 0). SparseVector requires strictly increasing, unique indices,
        // hence distinct + sorted.
        val indices = fields(1).split(";")
          .map(_.split(":")(0))
          .flatMap(dict.get)
          .map(_.toInt)
          .distinct
          .sorted
        // Every present feature carries the value 1.0.
        val vector = new SparseVector(dict.size, indices, Array.fill(indices.length)(1.0))
        LabeledPoint(label, vector)
      })
      // Train with stochastic gradient descent: 10 iterations, step size 0.1.
      val model = LogisticRegressionWithSGD.train(trainData, 10, 0.1)
      val weights = model.weights.toArray
      // Invert the dictionary: column index -> feature name, to label each weight.
      val indexToName = dict.map { case (name, idx) => (idx.toInt, name) }
      // Write "<feature>\t<weight>" per line; close the writer even if writing fails.
      val pw = new PrintWriter("c://test//20171018")
      try {
        for (i <- weights.indices) {
          val featureName = indexToName.getOrElse(i, " ")
          pw.println(featureName + "\t" + weights(i))
        }
        pw.flush()
      } finally {
        pw.close()
      }
    } finally {
      // Release Spark resources even if training or output fails.
      sc.stop()
    }
  }
}
样例数据如下所示(仅列举 3 行):
1 Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id889*hitop_id166:1;User.Item*Item,hitop_id46*hitop_id166:1;User.Item*Item,hitop_id985*hitop_id166:1;User.phone*Item,device_name1591*hitop_id166:1;User.pay*Item.price,pay_ability0*9290:1
1 Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id370*hitop_id166:1;User.Item*Item,hitop_id801*hitop_id166:1;User.Item*Item,hitop_id583*hitop_id166:1;User.phone*Item,device_name1422*hitop_id166:1;User.pay*Item.price,pay_ability1*9290:1
1 Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id300*hitop_id166:1;User.Item*Item,hitop_id968*hitop_id166:1;User.Item*Item,hitop_id400*hitop_id166:1;User.phone*Item,device_name3083*hitop_id166:1;User.pay*Item.price,pay_ability3*9290:1