How to Write and Read an Apache ORC File with Java

This post records how to write and read an Apache ORC file with Java.
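
The example targets the ORC Java core library. A minimal Maven setup might look like the following (the artifact versions are assumptions to check against your environment; 1.5.6 matches the orc-tools jar used later in this post):

<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-core</artifactId>
    <version>1.5.6</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
</dependency>

With those dependencies in place, the full write/read example: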
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class ORCReader {

    private static String testWrite() throws IOException, ParseException {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.createStruct()
                .addField("a-string", TypeDescription.createString())
                .addField("b-date", TypeDescription.createDate())
                .addField("c-double", TypeDescription.createDouble())
                .addField("d-time", TypeDescription.createTimestamp())
                .addField("e-string", TypeDescription.createString());
//      Equivalent schema built from a string (hyphenated names need backquotes):
//      TypeDescription schema = TypeDescription.fromString(
//              "struct<`a-string`:string,`b-date`:date,`c-double`:double,`d-time`:timestamp,`e-string`:string>");

        String orcFile = System.getProperty("java.io.tmpdir") + File.separator + "orc-test-" + System.currentTimeMillis() + ".orc";

        if(Files.exists(Paths.get(orcFile))) {
            Files.delete(Paths.get(orcFile));
        }

        Writer writer = OrcFile.createWriter(new Path(orcFile),
                OrcFile.writerOptions(conf)
                        .setSchema(schema));

        VectorizedRowBatch batch = schema.createRowBatch();
        BytesColumnVector a = (BytesColumnVector) batch.cols[0];
        LongColumnVector b = (LongColumnVector) batch.cols[1];
        DoubleColumnVector c = (DoubleColumnVector) batch.cols[2];
        TimestampColumnVector d = (TimestampColumnVector) batch.cols[3];
        BytesColumnVector e = (BytesColumnVector) batch.cols[4];
        // Parse the fixed timestamp once instead of once per row.
        Timestamp ts = new Timestamp(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ")
                .parse("2019-07-22T01:12:37.758-0500").getTime());
        for (int r = 0; r < 500; ++r) {
            int row = batch.size++;
            a.setVal(row, ("a-" + r).getBytes());
            // DATE columns are written as days since the epoch into a LongColumnVector.
            b.vector[row] = LocalDate.parse("2019-07-22").minusDays(r).toEpochDay();
            c.vector[row] = r;
            d.set(row, ts);
            e.setVal(row, ("e-" + r).getBytes());
            // If the batch is full, write it out and start over.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();

        return orcFile;
    }

    private static void readTest(String filePath) throws IOException {
        Configuration conf = new Configuration();
        conf.setAllowNullValueProperties(true);
        Reader reader = OrcFile.createReader(new Path(filePath),
                OrcFile.readerOptions(conf));

        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        System.out.println("schema: " + reader.getSchema());
        System.out.println("numCols: " + batch.numCols);
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
        while (rows.nextBatch(batch)) {
            BytesColumnVector cols0 = (BytesColumnVector) batch.cols[0];
            LongColumnVector cols1 = (LongColumnVector) batch.cols[1];
            DoubleColumnVector cols2 = (DoubleColumnVector) batch.cols[2];
            TimestampColumnVector cols3 = (TimestampColumnVector) batch.cols[3];
            BytesColumnVector cols4 = (BytesColumnVector) batch.cols[4];


            for (int col = 0; col < batch.numCols; col++) {
                System.out.println("column " + col + " vector type: " + batch.cols[col].type);
            }

            for (int r = 0; r < batch.size; r++) {
                String a = cols0.toString(r);
                // DATE values come back as days since the epoch; convert to yyyy-MM-dd.
                String b = LocalDate.ofEpochDay(cols1.vector[r])
                        .format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
                double c = cols2.vector[r];
                // asScratchTimestamp(r) loads row r into the vector's reusable scratch Timestamp.
                Timestamp d = cols3.asScratchTimestamp(r);
                String e = cols4.toString(r);
                System.out.println(a + ", " + b + ", " + c + ", " + simpleDateFormat.format(d) + ", " + e);
            }
        }
        rows.close();
    }

    public static void main(String[] args) throws IOException, ParseException {
        String filePath = testWrite();
        readTest(filePath);
    }
}
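
Reading every column is optional: the Reader.Options API supports column projection (and predicate push-down via SearchArgument). Below is a minimal sketch, under the five-column schema above, that reads only the "a-string" column; ORCProjectionExample is a hypothetical class name.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

import java.io.IOException;

public class ORCProjectionExample {
    public static void main(String[] args) throws IOException {
        Reader reader = OrcFile.createReader(new Path(args[0]),
                OrcFile.readerOptions(new Configuration()));

        // include[] is indexed by flattened type ids: 0 is the root struct,
        // 1..5 are the five fields in declaration order.
        boolean[] include = new boolean[reader.getSchema().getMaximumId() + 1];
        include[0] = true; // root struct
        include[1] = true; // "a-string"

        RecordReader rows = reader.rows(reader.options().include(include));
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            BytesColumnVector a = (BytesColumnVector) batch.cols[0];
            for (int r = 0; r < batch.size; r++) {
                System.out.println(a.toString(r));
            }
        }
        rows.close();
    }
}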


Inspect the ORC file with orc-tools.
  • orc-tools can also convert CSV/JSON to ORC, though its convert command did not support the date type; either way, we can use it to check the data or the metadata (an example convert invocation follows the data listing below).
  • Look up the data:
  • java -jar ~/software/orc-tools-1.5.6-uber.jar data /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
    log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
    log4j:WARN Please initialize the log4j system properly.
    log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
    Processing data file /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc [length: 3118]
    {"a-string":"a-0","b-date":"2019-07-22","c-double":0,"d-time":"2019-07-22 01:12:37.758","e-string":"e-0"}
    {"a-string":"a-1","b-date":"2019-07-21","c-double":1,"d-time":"2019-07-22 01:12:37.758","e-string":"e-1"}
    {"a-string":"a-2","b-date":"2019-07-20","c-double":2,"d-time":"2019-07-22 01:12:37.758","e-string":"e-2"}
    {"a-string":"a-3","b-date":"2019-07-19","c-double":3,"d-time":"2019-07-22 01:12:37.758","e-string":"e-3"}
    {"a-string":"a-4","b-date":"2019-07-18","c-double":4,"d-time":"2019-07-22 01:12:37.758","e-string":"e-4"}
    {"a-string":"a-5","b-date":"2019-07-17","c-double":5,"d-time":"2019-07-22 01:12:37.758","e-string":"e-5"}
    {"a-string":"a-6","b-date":"2019-07-16","c-double":6,"d-time":"2019-07-22 01:12:37.758","e-string":"e-6"}
    {"a-string":"a-7","b-date":"2019-07-15","c-double":7,"d-time":"2019-07-22 01:12:37.758","e-string":"e-7"}
    {"a-string":"a-8","b-date":"2019-07-14","c-double":8,"d-time":"2019-07-22 01:12:37.758","e-string":"e-8"}
    {"a-string":"a-9","b-date":"2019-07-13","c-double":9,"d-time":"2019-07-22 01:12:37.758","e-string":"e-9"}
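  • For reference, a convert invocation looks roughly like the line below. The convert subcommand and its --schema and --output flags are taken from the ORC java-tools page linked in the references; input.json, the schema string, and output.orc are hypothetical.
  • java -jar ~/software/orc-tools-1.5.6-uber.jar convert input.json --schema "struct<x:string,y:double>" --output output.orc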
    
  • Look up the metadata:
  • java -jar ~/software/orc-tools-1.5.6-uber.jar meta /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
    log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
    log4j:WARN Please initialize the log4j system properly.
    log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
    Processing data file /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc [length: 3118]
    Structure for /var/folders/ch/9l8p3shx5rdbzyh6dr17ljnm0000gp/T/orc-test-1564154755670.orc
    File Version: 0.12 with ORC_517
    Rows: 500
    Compression: ZLIB
    Compression size: 262144
    Type: struct<a-string:string,b-date:date,c-double:double,d-time:timestamp,e-string:string>
    
    Stripe Statistics:
      Stripe 1:
        Column 0: count: 500 hasNull: false
        Column 1: count: 500 hasNull: false bytesOnDisk: 904 min: a-0 max: a-99 sum: 2390
        Column 2: count: 500 hasNull: false bytesOnDisk: 9 min: 2018-03-10 max: 2019-07-22
        Column 3: count: 500 hasNull: false bytesOnDisk: 667 min: 0.0 max: 499.0 sum: 124750.0
        Column 4: count: 500 hasNull: false bytesOnDisk: 19 min: 2019-07-22 01:12:37.758 max: 2019-07-22 01:12:37.758
        Column 5: count: 500 hasNull: false bytesOnDisk: 904 min: e-0 max: e-99 sum: 2390
    
    File Statistics:
      Column 0: count: 500 hasNull: false
      Column 1: count: 500 hasNull: false bytesOnDisk: 904 min: a-0 max: a-99 sum: 2390
      Column 2: count: 500 hasNull: false bytesOnDisk: 9 min: 2018-03-10 max: 2019-07-22
      Column 3: count: 500 hasNull: false bytesOnDisk: 667 min: 0.0 max: 499.0 sum: 124750.0
      Column 4: count: 500 hasNull: false bytesOnDisk: 19 min: 2019-07-22 01:12:37.758 max: 2019-07-22 01:12:37.758
      Column 5: count: 500 hasNull: false bytesOnDisk: 904 min: e-0 max: e-99 sum: 2390
    
    Stripes:
      Stripe: offset: 3 data: 2503 rows: 500 tail: 104 index: 173
        Stream: column 0 section ROW_INDEX start: 3 length 12
        Stream: column 1 section ROW_INDEX start: 15 length 34
        Stream: column 2 section ROW_INDEX start: 49 length 27
        Stream: column 3 section ROW_INDEX start: 76 length 36
        Stream: column 4 section ROW_INDEX start: 112 length 30
        Stream: column 5 section ROW_INDEX start: 142 length 34
        Stream: column 1 section DATA start: 176 length 891
        Stream: column 1 section LENGTH start: 1067 length 13
        Stream: column 2 section DATA start: 1080 length 9
        Stream: column 3 section DATA start: 1089 length 667
        Stream: column 4 section DATA start: 1756 length 11
        Stream: column 4 section SECONDARY start: 1767 length 8
        Stream: column 5 section DATA start: 1775 length 891
        Stream: column 5 section LENGTH start: 2666 length 13
        Encoding column 0: DIRECT
        Encoding column 1: DIRECT_V2
        Encoding column 2: DIRECT_V2
        Encoding column 3: DIRECT
        Encoding column 4: DIRECT_V2
        Encoding column 5: DIRECT_V2
    
    File length: 3118 bytes
    Padding length: 0 bytes
    Padding ratio: 0%
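
The same information is available programmatically through the org.apache.orc.Reader interface. Here is a short sketch (ORCMetaExample is a hypothetical class name; the getters are part of the public Reader API):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

import java.io.IOException;

public class ORCMetaExample {
    public static void main(String[] args) throws IOException {
        Reader reader = OrcFile.createReader(new Path(args[0]),
                OrcFile.readerOptions(new Configuration()));
        // Mirrors the orc-tools "meta" output: row count, compression, schema, stripes.
        System.out.println("rows: " + reader.getNumberOfRows());
        System.out.println("compression: " + reader.getCompressionKind());
        System.out.println("schema: " + reader.getSchema());
        for (StripeInformation stripe : reader.getStripes()) {
            System.out.println("stripe offset=" + stripe.getOffset()
                    + " dataLength=" + stripe.getDataLength()
                    + " rows=" + stripe.getNumberOfRows());
        }
    }
}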
    

References:
  • ORC Tools https://orc.apache.org/docs/java-tools.html
  • ORC Tools Source Code https://github.com/apache/orc/tree/master/java/tools