Lesson 8: Developing Spark Programs with an IDE

Stage 1: Spark Streaming, Spark SQL, Kafka, and the principles of the Spark core (experience with large projects is required);
Stage 2: the various environments Spark runs in, resolving all kinds of failures, and performance tuning (requires a thorough command of the Spark core and its runtime principles);
Stage 3: stream processing and machine learning lead the field, so the content of the first two stages must be mastered first.
Following teacher Wang Jialin's foundational explanations, emphasize hands-on practice, become a Spark expert, and stand tall in the big data forest!
Part 1: Study Notes
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
/**
 * Lesson 8: Developing Spark programs with an IDE
 *
 * Step 1: install Scala (version 2.10.x)
 * Step 2: download the Spark 1.4.1 jar package
 * Step 3: add the downloaded Spark jar to the Eclipse build path: spark-assembly-1.4.1-hadoop2.6.0.jar
 * Step 4: create a Scala project
 * Step 5: change the generated class to an object and write the main method
 *
 * Below, a local Spark WordCount program is written in Scala.
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    /**
     * Step 1: create the Spark configuration object SparkConf and set the runtime configuration
     * of the Spark application. For example, setMaster sets the URL of the Master of the Spark
     * cluster the program connects to; if it is set to "local", the Spark program runs locally,
     * which suits beginners with very limited hardware (e.g. only 1 GB of memory).
     */
    val conf = new SparkConf() // create the SparkConf object
    conf.setAppName("Wow, My first Spark App!") // set the application name, shown in the monitoring UI while the program runs
    conf.setMaster("local") // the program runs locally, so no Spark cluster installation is needed
    /**
     * Step 2: create the SparkContext object.
     * SparkContext is the sole entry point for all Spark functionality; whether the program is
     * written in Scala, Java, Python or R, it must have a SparkContext.
     * Core responsibility of SparkContext: initialize the core components the Spark application
     * needs to run, including the DAGScheduler, TaskScheduler and SchedulerBackend.
     * It is also responsible for registering the Spark program with the Master.
     * SparkContext is the single most important object in the whole Spark application.
     */
    val sc = new SparkContext(conf) // create the SparkContext, passing the SparkConf instance to customize how Spark runs
    /**
     * Step 3: create an RDD through the SparkContext based on the concrete data source
     * (HDFS, HBase, Local FS, DB, S3, etc.).
     * An RDD can basically be created in three ways: from an external data source (e.g. HDFS),
     * from a Scala collection, or by transforming another RDD.
     * The data is split by the RDD into a series of Partitions; the data assigned to each
     * Partition falls within the scope of one Task.
     */
    //val lines: RDD[String] = sc.textFile("hdfs://127.0.0.1:9000/cm_news_dev/leihui/test/wordcount/input/worddata.txt", 1)
    //val lines = sc.textFile("hdfs://127.0.0.1:9000/cm_news_dev/leihui/test/wordcount/input/worddata.txt", 1) // read the file and set it to 1 partition
    val lines = sc.textFile("D:\\data\\2.txt", 1) // read the local file and set it to 1 partition
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.1: split each line of text into individual words.
     */
    val words = lines.flatMap { line => line.split("[ ]") } // split every line into words and flatten the per-line results into one large word collection
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.2: on top of the word split, count each word instance as 1, i.e. word => (word, 1).
     */
    val pairs = words.map { word => (word, 1) } // each (word, 1) is a tuple
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.3: starting from the per-instance count of 1, sum the total number of occurrences of each word in the file.
     */
    val wordCounts = pairs.reduceByKey(_ + _) // add up the values for identical keys (reduced both locally and at the reducer level in a cluster)
    wordCounts.foreach(wordNumberPair => println(wordNumberPair._1 + " : " + wordNumberPair._2))
    
    sc.stop()
    
    
    /**
     * Note: with Spark prebuilt for Hadoop, the following warning may appear and can be ignored:
     * "Unable to load native-hadoop library for your platform... using builtin-java classes where applicable"
     */
  }
}
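
The step 3 comment notes that an RDD can also be created from a Scala collection or from another RDD, not only from a file. As a minimal sketch of that alternative (the object name and the sample sentences below are made up for illustration), the same word count can be run against an in-memory collection with sc.parallelize:

import org.apache.spark.{SparkConf, SparkContext}

object WordCountFromCollection {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount from a Scala collection").setMaster("local")
    val sc = new SparkContext(conf)

    // Create an RDD from an in-memory Scala collection instead of a file;
    // the second argument is the number of partitions (sample data is hypothetical).
    val lines = sc.parallelize(Seq("hello spark", "hello scala", "hello big data"), 1)

    val wordCounts = lines
      .flatMap(_.split(" "))   // split each line into words
      .map(word => (word, 1))  // count each word instance as 1
      .reduceByKey(_ + _)      // sum the counts per word

    wordCounts.foreach { case (word, count) => println(word + " : " + count) }
    sc.stop()
  }
}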

Part 2: Homework
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
/**
 * Lesson 8: Developing Spark programs with an IDE
 *
 * Step 1: install Scala (version 2.10.x)
 * Step 2: download the Spark 1.4.1 jar package
 * Step 3: add the downloaded Spark jar to the Eclipse build path: spark-assembly-1.4.1-hadoop2.6.0.jar
 * Step 4: create a Scala project
 * Step 5: change the generated class to an object and write the main method
 *
 * Below, the Spark WordCount program written in Scala is adapted to run on a cluster.
 */
object WordCount_Cluster {
  def main(args: Array[String]): Unit = {
    /**
     * Step 1: create the Spark configuration object SparkConf and set the runtime configuration
     * of the Spark application. For example, setMaster sets the URL of the Master of the Spark
     * cluster the program connects to; if it is set to "local", the Spark program runs locally,
     * which suits beginners with very limited hardware (e.g. only 1 GB of memory).
     */
    val conf = new SparkConf() // create the SparkConf object
    conf.setAppName("Wow, My first Spark App!") // set the application name, shown in the monitoring UI while the program runs
    //conf.setMaster("local") // commented out: when the job is submitted to the cluster, the master is supplied on the command line instead of being hard-coded
    /**
     * Step 2: create the SparkContext object.
     * SparkContext is the sole entry point for all Spark functionality; whether the program is
     * written in Scala, Java, Python or R, it must have a SparkContext.
     * Core responsibility of SparkContext: initialize the core components the Spark application
     * needs to run, including the DAGScheduler, TaskScheduler and SchedulerBackend.
     * It is also responsible for registering the Spark program with the Master.
     * SparkContext is the single most important object in the whole Spark application.
     */
    val sc = new SparkContext(conf) // create the SparkContext, passing the SparkConf instance to customize how Spark runs
    /**
     * Step 3: create an RDD through the SparkContext based on the concrete data source
     * (HDFS, HBase, Local FS, DB, S3, etc.).
     * An RDD can basically be created in three ways: from an external data source (e.g. HDFS),
     * from a Scala collection, or by transforming another RDD.
     * The data is split by the RDD into a series of Partitions; the data assigned to each
     * Partition falls within the scope of one Task.
     */
    //val lines: RDD[String] = sc.textFile("hdfs://127.0.0.1:9000/cm_news_dev/leihui/test/wordcount/input/worddata.txt", 1)
    //val lines = sc.textFile("hdfs://127.0.0.1:9000/cm_news_dev/leihui/test/wordcount/input/worddata.txt", 1) // read the file from HDFS and set it to 1 partition
    val lines = sc.textFile("hdfs://127.0.0.1:9000/cm_news_dev/leihui/test/wordcount/input/worddata.txt") // read the file from HDFS; the number of partitions follows the HDFS blocks
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.1: split each line of text into individual words.
     */
    val words = lines.flatMap { line => line.split(" ") } // split every line into words and flatten the per-line results into one large word collection
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.2: on top of the word split, count each word instance as 1, i.e. word => (word, 1).
     */
    val pairs = words.map { word => (word, 1) } // each (word, 1) is a tuple
    /**
     * Step 4: apply Transformation-level operations to the initial RDD, such as the higher-order
     * functions map and filter, to perform the actual computation.
     * Step 4.3: starting from the per-instance count of 1, sum the total number of occurrences of each word in the file.
     */
    val wordCounts = pairs.reduceByKey(_ + _) // add up the values for identical keys (reduced both locally and at the reducer level in a cluster)
    wordCounts.collect().foreach(wordNumberPair => println(wordNumberPair._1 + " : " + wordNumberPair._2)) // collect the results to the driver so they are printed in the driver's console rather than on the executors
    
    sc.stop()
    
    
    /**
     * Note: with Spark prebuilt for Hadoop, the following warning may appear and can be ignored:
     * "Unable to load native-hadoop library for your platform... using builtin-java classes where applicable"
     */
    /**
     * Lesson 8 homework: develop the program in Eclipse, package it, and run it on the cluster.
     */
  }
}
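
Because setMaster is commented out in the cluster version, the master URL is supplied when the job is submitted. A minimal sketch of submitting the packaged program with spark-submit, assuming the jar exported from Eclipse is named wordcount.jar and a standalone master runs at spark://master:7077 (the jar path and master URL are placeholders for your own environment):

# Submit the packaged WordCount_Cluster job to a standalone Spark cluster
# (jar path and master URL are placeholders).
spark-submit \
  --class WordCount_Cluster \
  --master spark://master:7077 \
  /path/to/wordcount.jar

Note that the cluster version calls collect() before foreach: on a cluster, foreach alone would print on the executors, so collect() first brings the (word, count) pairs back to the driver, where the println output is actually visible.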