Spark 모델 예제: 두 가지 방법(MLlib와 ML)으로 랜덤 포레스트 모델을 구현합니다.

11840 단어 데이터 발굴 spark

이 글은 이전 글(http://blog.csdn.net/dahunbi/article/details/72821915)에 이어지는 내용이다. 공식 예제는 훈련에 사용할 데이터를 파일에서 그대로 로드할 뿐 아무런 전처리도 하지 않아, 다소 편법에 가깝다는 단점이 있다.

    // Read the sample LIBSVM file directly from its on-disk path (no preprocessing step).
    val samplePath = "data/mllib/sample_libsvm_data.txt"
    val data = MLUtils.loadLibSVMFile(sc, samplePath)

실무에서 Spark는 대개 Hadoop 시스템 위에 구축되어 있고 테이블은 모두 HDFS에 저장되어 있습니다. 따라서 일반적인 데이터 추출 방식은 HiveContext를 통해 Hive SQL을 실행하는 것입니다. 이전 글에서 언급했듯이 Spark에는 두 개의 머신러닝 라이브러리가 있는데, 하나는 ML이고 다른 하나는 MLlib입니다.
ML 예제(Pipeline 사용):

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.ml.util.MLWritable
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path, FileSystem}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}


    // HiveContext is used to pull the training table out of Hive with plain SQL.
    val hc = new HiveContext(sc)
    import hc.implicits._

    // Load the full training table. The code below requires a column named "label".
    // NOTE(review): the original comments describe it as a 0/1 label — confirm against the table.
    val data = hc.sql("""select  *  from database1.traindata_userprofile""")

    // Feature columns are every column after the first two.
    // NOTE(review): presumably the first two columns are an id and the label — confirm.
    // "label" is also excluded by name so the target can never leak into the feature
    // vector if the table's column order differs from that assumption.
    val featureCols = data.columns.drop(2).filterNot(_ == "label")

    // VectorAssembler is a transformer that packs the listed numeric columns into a single
    // vector column ("userFea"). Nulls in numeric columns are replaced with a sentinel value
    // first, because the assembler cannot build a vector from missing values.
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("userFea")
    val userProfile = assembler.transform(data.na.fill(-1e9)).select("label", "userFea")

    // Index the label column, and let VectorIndexer treat features with at most 4 distinct
    // values as categorical. Both indexers are fitted on the whole dataset so that every
    // label / category value is known to them.
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(userProfile)
    val featureIndexer = new VectorIndexer()
      .setInputCol("userFea")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(userProfile)

    // Split the data into training and test sets (30% held out for testing).
    // No seed is passed, so the split differs from run to run.
    val Array(trainingData, testData) = userProfile.randomSplit(Array(0.7, 0.3))

    // Configure the random forest classifier.
    val rf = new RandomForestClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")
      .setMaxBins(32)
      .setMaxDepth(6)
      .setNumTrees(90)
      .setMinInstancesPerNode(4)
      .setImpurity("gini")

    // Convert indexed predictions back to the original label values.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // Chain indexers, forest and label converter into one pipeline.
    val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))

    // Train model. This also runs the indexers.
    val model = pipeline.fit(trainingData)
    println("training finished!!!!")

    // Make predictions on the held-out split.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("predictedLabel", "indexedLabel", "indexedFeatures").show(5)

    // Accuracy on the held-out split; the reported test error is its complement.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("Test Error = " + (1.0 - accuracy))
}

MLlib 예제(RDD 기반): ML의 vector를 MLlib의 vector로 변환하는 과정에 주의하십시오.

import java.io.{ObjectInputStream, ObjectOutputStream}


import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path, FileSystem}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
//import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.util.MLUtils

  // HiveContext is used to pull the training table out of Hive with plain SQL.
  val hc = new HiveContext(sc)
  import hc.implicits._

  // Load the full training table. The code below requires a column named "label".
  val data = hc.sql("""select  *  from database1.traindata_userprofile""")

  // Feature columns are every column after the first one. "label" is also excluded by name
  // so the target can never leak into the feature vector.
  // NOTE(review): the ML Pipeline example in this article drops the first TWO columns of the
  // same table (and the original comment here also mentioned drop(2)) — confirm which
  // leading columns are really non-features.
  val featureCols = data.columns.drop(1).filterNot(_ == "label")

  // VectorAssembler (from the ML package) packs the listed numeric columns into a single
  // vector column ("userFea"). Nulls in numeric columns are replaced with a sentinel value
  // first, because the assembler cannot build a vector from missing values.
  val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("userFea")
  val userProfile = assembler.transform(data.na.fill(-1e9)).select("label", "userFea")

  // Important: VectorAssembler emits ML vectors, while the RDD-based MLlib API expects MLlib
  // vectors. The two types share a name but are not interchangeable, so convert the column.
  val userProfile2 = MLUtils.convertVectorColumnsFromML(userProfile, "userFea")

  // Build the RDD[LabeledPoint] training set.
  // Assumes the "label" column is stored as a double — TODO confirm against the table schema.
  val rdd_Data: RDD[LabeledPoint] = userProfile2.rdd.map { row =>
    LabeledPoint(row.getAs[Double]("label"), row.getAs[Vector]("userFea"))
  }

  // Random forest hyper-parameters.
  val numClasses = 2
  // Empty map: every feature is treated as continuous.
  val categoricalFeaturesInfo = Map[Int, Int]()
  val numTrees = 70
  // Let the algorithm choose.
  val featureSubsetStrategy = "auto"
  val impurity = "gini"
  val maxDepth = 9
  val maxBins = 32

  val modelRF: RandomForestModel = RandomForest.trainClassifier(rdd_Data, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
  println("training finished!!!!")

  // Predict on the same rows the model was trained on (no held-out split is made here),
  // so the printed pairs are a sanity check rather than a test-error estimate.
  val labelAndPreds = userProfile2.rdd.map { row =>
    val label = row.getAs[Double]("label")
    val prediction = modelRF.predict(row.getAs[Vector]("userFea"))
    (label, prediction)
  }
  labelAndPreds.take(10).foreach(println)

  // Persist the trained model, then shut down the SparkContext used throughout this script.
  modelRF.save(sc, "/home/user/victorhuang/RFCModel_mllib")
  sc.stop()

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

MySQL에서 머신러닝용 경마 데이터 준비

하지만 자유형 구축의 논리에도 시간이 걸리고 자유형 자체에도 시간이 걸리는 점 등을 고려해 이번에는 중앙경마를 주최하는 JRA가 제공하는 JRA-VAN 데이터 실험실의 무료 체험판에 첨부된 DVD 데이터로 MySQL...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

걱정 없는 모바일 앱 인터페이스

POLITO COMPUTER SCIENCE lab numero tre

좋은 웹페이지 즐겨찾기

개발자 우수 사이트 수집

개발자가 알아야 할 필수 사이트 100선 추천 우리는 당신을 위해 100개의 자주 사용하는 개발자 학습 사이트를 정리했습니다