Spark Structured Streaming: reading Confluent Kafka Avro data

1. Preparing the data
Write Avro records into Kafka through the kafka-rest proxy.
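Each request body uses the REST Proxy v1 embedded-Avro format: a value_schema JSON string plus a records array (the producer below alternates the name field between "one" and "two"). Unescaped for readability, one payload looks like this:

{
  "value_schema": "{\"type\": \"record\", \"name\": \"news_doc\", \"fields\": [{\"name\": \"name\", \"type\": \"string\"}, {\"name\": \"time\", \"type\": \"long\"}]}",
  "records": [
    { "value": { "name": "one", "time": 1553069910680 } }
  ]
}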
import java.util.Random;

public class Test {

    public static void main(String[] args) {
        String url = "http://node9:8082/topics/ztwo";
        Random random = new Random();
        int x = 1;
        while (true) {
            // Alternate between two records so the stream contains two distinct names.
            int i = random.nextInt();
            String json;
            if (i % 2 == 0) {
                json = "{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"one\",\"time\":1553069910680}}]}";
            } else {
                json = "{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"two\",\"time\":1553069910680}}]}";
            }
            x++;
            System.out.println(x);
            HttpRequest.sendPost(url, json);

            try {
                Thread.sleep(3000);   // one record every three seconds
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}




The HttpRequest.sendPost helper used above wraps Apache HttpClient:

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

public class HttpRequest {

    public static String sendPost(String url, String data) {
        HttpClient httpClient = new DefaultHttpClient();
        HttpPost post = new HttpPost(url);
        // The REST Proxy v1 Avro endpoint expects this content type.
        post.setHeader("Content-type", "application/vnd.kafka.avro.v1+json");
        StringEntity entity = new StringEntity(data, Charset.forName("UTF-8"));
        entity.setContentEncoding("UTF-8");
        post.setEntity(entity);

        try {
            HttpResponse response = httpClient.execute(post);
            return EntityUtils.toString(response.getEntity(), "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
            return e.getMessage();
        }
    }
}











2. Reading and analyzing the data with Spark

The consumer registers a UDF that runs Confluent's Avro deserializer on the raw Kafka value bytes, turning each record into a JSON string, which from_json then parses using the schema fetched from the Schema Registry.
package cn

import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro._


/**
 * Consumes Confluent-Avro records from Kafka with Structured Streaming,
 * decoding them through the Schema Registry.
 */
object App {

  private var schemaRegistryClient: SchemaRegistryClient = _
  private var kafkaAvroDeserializer: AvroDeserializer = _

  def getTopicSchema(topic: String) = {
    schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  }
  def avroSchemaToSparkSchema(avroSchema: String) = {
    SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
  }
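
  // Example (sketch): once schemaRegistryClient is initialized (see
  // consumeAvro below), these two helpers resolve the subject "ztwo-value"
  // and produce the Spark schema that from_json uses later:
  //   val schemaType = avroSchemaToSparkSchema(getTopicSchema("ztwo"))
  //   schemaType.dataType  // StructType(name: string, time: long)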


  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
      .setAppName("kafka-structured")
      .set("spark.testing.memory", "2147480000")
      .setMaster("local[*]")

    val spark = SparkSession.builder()
      .config(conf)
      .getOrCreate()

    val bootstrapServers ="node9:9092"
    val topic =  "ztwo"
    val schemaRegistryUrl = "http://node9:8081"
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.WARN)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.WARN)

    consumeAvro(spark, bootstrapServers, topic, schemaRegistryUrl)

    spark.stop()
  }

  private def consumeAvro(spark: SparkSession, bootstrapServers: String, topic: String, schemaRegistryUrl: String): Unit = {
    import spark.implicits._

    schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
    kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)

    // Register a UDF that decodes Confluent-framed Avro bytes
    // (magic byte + 4-byte schema id + Avro body) into a JSON string.
    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      kafkaAvroDeserializer.deserialize(bytes)
    )

    val rawDf = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .option("startingOffsets", "earliest")
      // note: a "group.id" option is not honored here; Spark's Kafka source
      // manages its own consumer group.
      .load()

    import org.apache.spark.sql.functions._
    val jsonDf = rawDf.select(callUDF("deserialize", 'value).as("value"))

    val dfValueSchema = {
      val rawSchema = getTopicSchema(topic)
      avroSchemaToSparkSchema(rawSchema)
    }

    // Parse the JSON with the schema derived from the registry, then flatten
    // the struct into top-level columns (name, time).
    val parsedDf = jsonDf.select(from_json('value, dfValueSchema.dataType).alias("value"))
      .select($"value.*")

    parsedDf.createTempView(topic)

    val output = spark.sql("select count(*) from " + topic + " group by name")


    output.writeStream
      .format("console")
      // aggregations without a watermark require "complete" (or "update") mode
      .outputMode("complete")
      .start()
      .awaitTermination()
  }


  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }

    override def deserialize(bytes: Array[Byte]): String = {
      val value = super.deserialize(bytes)
      value match {
        case str: String =>
          str
        case _ =>
          val genericRecord = value.asInstanceOf[GenericRecord]
          if (genericRecord == null) {
            // tombstone / empty payload: nothing to decode
            null
          } else {
            // GenericRecord.toString renders the record as JSON
            genericRecord.toString
          }
      }
    }
  }

}
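
The query above only emits the counts. If you also want to see which name each count belongs to, a small variant inside consumeAvro (my sketch, not part of the original post) would be:

    val output = spark.sql("select name, count(*) as cnt from " + topic + " group by name")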

 

3. pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.golaxy</groupId>
  <artifactId>kafka-spark-connector</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.12.7</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>confluent</id>
      <url>http://packages.confluent.io/maven/</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>2.1.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.38</version>
    </dependency>
    <dependency>
      <groupId>com.thoughtworks.paranamer</groupId>
      <artifactId>paranamer</artifactId>
      <version>2.8</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-avro_2.12</artifactId>
      <version>2.4.0</version>
    </dependency>
    <!-- com.databricks:spark-avro_2.11:4.0.0 (from the original list) targets
         Scala 2.11 and duplicates spark-avro_2.12 above; it is left out here
         to avoid mixing Scala versions on the classpath. -->
    <dependency>
      <groupId>io.confluent</groupId>
      <artifactId>kafka-avro-serializer</artifactId>
      <version>3.2.0</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.avro</groupId>
          <artifactId>avro</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
