Calling Logistic Regression in Spark

Hello everyone!

Calling logistic regression in Spark touches on machine learning and its algorithms, which is not easy to understand at first. The program below reads tab-separated training samples, builds a feature dictionary, turns each sample into a LabeledPoint with a one-hot SparseVector, trains a logistic regression model with SGD, and finally writes each feature together with its learned weight to a file.

package Traffic


import java.io.PrintWriter
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}


/**
  * Created by Administrator on 2017/10/18.
  */
class Recommder {


}


object Recommder {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Recommder").setMaster("local[2]")
    val sc = new SparkContext(conf)


    // Split each line on \t to get the label string and the feature string
    val lines = sc.textFile("D://hbase//day27//000001_1").map(_.split("\t"))
    // Pull out every feature key and deduplicate
    // lines is RDD[Array[String]]; flatMap flattens it into RDD[String]
    val line = lines.flatMap(_.drop(1)(0).split(";")).map(_.split(":")(0)).distinct()
//    println(line.collect().toBuffer)
    // Build a dictionary mapping each feature string to an index
    // (collected as a Map so we can later look up the non-zero indices of the sparse vector)
    val dict = line.zipWithIndex().collectAsMap()
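    // For illustration only (the actual numbering depends on the order distinct() returns):
    // dict might look like Map("Item.id,hitop_id166" -> 0, "Item.screen,screen18" -> 1, ...)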
    // Build LabeledPoints, each made of a label and a vector
    val traindata = lines.map(x => {
      // Get the label; MLlib logistic regression only accepts 0.0 and 1.0, so convert
      val label = x.take(1)(0) match {
        case "-1" => 0.0
        case "1"  => 1.0
      }
      // Look up the dictionary index of each feature in this sample;
      // those positions are the non-zero entries, and each value is 1.0
      val index = x.drop(1)(0).split(";").map(_.split(":")(0)).map(
        fe => {
          val ind = dict.get(fe) match {
            case Some(n) => n
            case None    => 0 // unknown features fall back to index 0
          }
          ind.toInt
        }).distinct.sorted // SparseVector expects unique indices in increasing order
      // Create an array of 1.0s as the non-zero values of the sparse vector
      val vector = new SparseVector(dict.size, index, Array.fill(index.length)(1.0))
      // Create the LabeledPoint
      new LabeledPoint(label, vector)
    })
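    // Illustrative result for one sample (the indices shown are made up; they depend on dict):
    // LabeledPoint(1.0, SparseVector(dict.size, [3, 8, 21, ...], [1.0, 1.0, 1.0, ...]))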
    // Train the model with gradient-descent (SGD) logistic regression; the two extra
    // arguments are the number of iterations and the step size (step size defaults to 1.0)
    val model = LogisticRegressionWithSGD.train(traindata, 10, 0.1)
    // Get the learned weights
    val weight = model.weights.toArray
    // Invert the dictionary so each index maps back to its feature string
    val map = dict.map(x => (x._2, x._1)) // (feature index, feature string)
    val pw = new PrintWriter("c://test//20171018")
    // Write out one "feature \t weight" line per feature
    for (i <- 0 until weight.length) {
      val featurename = map.get(i) match {
        case Some(x) => x
        case None    => " "
      }
      val result = featurename + "\t" + weight(i)
      pw.write(result)
      pw.println()
    }
    pw.flush()
    pw.close()
  }
}
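The listing above writes the weights out but never measures how well the model fits. As a hedged extension (not part of the original program), one could hold out a fraction of traindata and compute the AUC with MLlib's BinaryClassificationMetrics, for example:

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// Hold out 20% of the samples for evaluation
val Array(train, test) = traindata.randomSplit(Array(0.8, 0.2), seed = 42L)
val evalModel = LogisticRegressionWithSGD.train(train, 10, 0.1)

// clearThreshold() makes predict() return raw scores instead of 0.0/1.0 labels
evalModel.clearThreshold()
val scoreAndLabel = test.map(p => (evalModel.predict(p.features), p.label))

// Area under the ROC curve: 0.5 is random guessing, 1.0 is perfect
val metrics = new BinaryClassificationMetrics(scoreAndLabel)
println(s"AUC = ${metrics.areaUnderROC()}")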

Sample data is shown below (only 3 rows are listed):

1	Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id889*hitop_id166:1;User.Item*Item,hitop_id46*hitop_id166:1;User.Item*Item,hitop_id985*hitop_id166:1;User.phone*Item,device_name1591*hitop_id166:1;User.pay*Item.price,pay_ability0*9290:1
1	Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id370*hitop_id166:1;User.Item*Item,hitop_id801*hitop_id166:1;User.Item*Item,hitop_id583*hitop_id166:1;User.phone*Item,device_name1422*hitop_id166:1;User.pay*Item.price,pay_ability1*9290:1
1	Item.id,hitop_id166:1;Item.screen,screen18:1;Item.name,ch_name220:1;All,0:1;Item.author,author72:1;Item.sversion,sversion9:1;Item.network,x:1;Item.dgner,designer108:1;Item.icount,4:1;Item.stars,1.41:1;Item.comNum,11:1;Item.font,font4:1;Item.price,9290:1;Item.fsize,2:1;Item.ischarge,1:1;Item.downNum,1000:1;User.Item*Item,hitop_id300*hitop_id166:1;User.Item*Item,hitop_id968*hitop_id166:1;User.Item*Item,hitop_id400*hitop_id166:1;User.phone*Item,device_name3083*hitop_id166:1;User.pay*Item.price,pay_ability3*9290:1
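To make the parsing in the listing concrete, here is a small stand-alone sketch (plain Scala, no Spark required; the row is a shortened copy of the first sample) that applies the same split logic:

// Shortened copy of the first sample row: label \t feature:value;feature:value;...
val row = "1\tItem.id,hitop_id166:1;Item.screen,screen18:1;All,0:1"
val parts = row.split("\t")
val label = if (parts(0) == "-1") 0.0 else 1.0   // -> 1.0
val featureKeys = parts(1).split(";").map(_.split(":")(0))
// -> Array("Item.id,hitop_id166", "Item.screen,screen18", "All,0")
println(s"label=$label, features=${featureKeys.mkString(";")}")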


Reposted from blog.csdn.net/zhaoxiangchong/article/details/78385502