Data deduplication
16880 단어 Data
오후 내내 밤늦게까지 일했더니 슬프다...
프로그램에 분명히 허점이 있을 것이다. 단일 머신에서 작은 데이터 파일로 테스트했을 때만 통과했을 뿐이다. 지적을 환영합니다~~
코드는 다음과 같습니다.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * A composite Hadoop key made of two {@link Text} components.
 *
 * <p>Instances are serialized as {@code first} followed by {@code second}, and are ordered
 * by {@code first} and then by {@code second}. Equality and hashing use both components.
 *
 * <p>The {@code Text} arguments passed to the constructor and setters are stored by
 * reference, not copied.
 */
public class TextPair implements WritableComparable<TextPair> {
    public Text first;
    public Text second;

    /** Creates a pair holding two empty {@link Text} values. */
    public TextPair() {
        this(new Text(), new Text());
    }

    /**
     * Creates a pair that wraps the given {@link Text} objects.
     *
     * @param first  the first component
     * @param second the second component
     */
    public TextPair(Text first, Text second) {
        this.first = first;
        this.second = second;
    }

    /**
     * Creates a pair from two strings, wrapping each in a new {@link Text}.
     *
     * @param first  the first component
     * @param second the second component
     */
    public TextPair(String first, String second) {
        this(new Text(first), new Text(second));
    }

    /** @return the first component */
    public Text getFirst() {
        return first;
    }

    /** @param first the new first component */
    public void setFirst(Text first) {
        this.first = first;
    }

    /** @return the second component */
    public Text getSecond() {
        return second;
    }

    /** @param second the new second component */
    public void setSecond(Text second) {
        this.second = second;
    }

    /**
     * Replaces both components at once.
     *
     * @param first  the new first component
     * @param second the new second component
     */
    public void set(Text first, Text second) {
        this.first = first;
        this.second = second;
    }

    /** Combines the hash codes of both components. */
    @Override
    public int hashCode() {
        return 163 * first.hashCode() + second.hashCode();
    }

    /** Two pairs are equal when both of their components are equal. */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof TextPair)) {
            return false;
        }
        TextPair other = (TextPair) obj;
        if (!first.equals(other.getFirst())) {
            return false;
        }
        return second.equals(other.getSecond());
    }

    /** Renders the pair as the two components separated by a tab. */
    @Override
    public String toString() {
        return new StringBuilder().append(first).append("\t").append(second).toString();
    }

    /** Reads {@code first} and then {@code second} from the input. */
    @Override
    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    /** Writes {@code first} and then {@code second} to the output. */
    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    /** Orders by the first component; the second component breaks ties. */
    @Override
    public int compareTo(TextPair tp) {
        int byFirst = first.compareTo(tp.getFirst());
        return byFirst != 0 ? byFirst : second.compareTo(tp.getSecond());
    }
}
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Chooses the partition from the first component of the {@link TextPair} key only, so
 * all records that share the same first component land in the same partition.
 */
public class FirstPartitioner extends Partitioner<TextPair, Text> {
    /**
     * @param key           the composite key; only its first component is used
     * @param value         the record value (not used)
     * @param numPartitions the number of partitions to spread keys over
     * @return the partition index computed from the first component's hash code
     */
    @Override
    public int getPartition(TextPair key, Text value, int numPartitions) {
        // Clearing the sign bit already yields a non-negative value.
        int nonNegativeHash = key.getFirst().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Comparator that looks only at the first component of a {@link TextPair}, so keys that
 * differ only in their second component compare as equal.
 */
public class GroupComparator extends WritableComparator {
    /** Registers {@link TextPair} as the key class and asks the base class to create key instances. */
    public GroupComparator() {
        super(TextPair.class, true);
    }

    /**
     * Compares two {@link TextPair} keys by their first component.
     *
     * @param a the left key; must be a {@code TextPair}
     * @param b the right key; must be a {@code TextPair}
     * @return the result of comparing the two first components
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final TextPair left = (TextPair) a;
        final TextPair right = (TextPair) b;
        return left.getFirst().compareTo(right.getFirst());
    }
}
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper of the deduplication job.
 *
 * <p>Each input line is split on whitespace. The first two tokens become a
 * {@link TextPair} key, and the second token is also emitted as the value.
 */
public class DatadeduplicationMapper extends Mapper<LongWritable, Text, TextPair, Text> {
    /** Counter group used to report lines that could not be parsed. */
    private static final String COUNTER_GROUP = "Datadeduplication";
    /** Counter name for lines that contain fewer than two tokens. */
    private static final String MALFORMED_LINES = "MALFORMED_LINES";

    /**
     * Emits {@code ((first, second), second)} for one input line.
     *
     * <p>Lines with fewer than two whitespace-separated tokens (including blank lines)
     * are skipped and counted instead of failing the task. Tokens after the second one
     * are ignored.
     *
     * @param key     the input key supplied by the input format (not used)
     * @param value   one line of input text
     * @param context the task context used to emit output and update counters
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        if (tokens.countTokens() < 2) {
            // Previously such lines crashed the task while building a Text from null.
            context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
            return;
        }
        String first = tokens.nextToken();
        String second = tokens.nextToken();
        context.write(new TextPair(first, second), new Text(second));
    }
}
import java.awt.List;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer of the deduplication job.
 *
 * <p>For every group of values it writes each distinct value once, paired with the
 * first component of the group's key.
 */
public class DatadeduplicationReducer extends
        Reducer<TextPair, Text, Text, Text> {
    /**
     * Writes {@code (key.getFirst(), value)} for each distinct value in the group, in the
     * order in which the values are first seen.
     *
     * @param key     the composite key of the group; only its first component is written
     * @param values  the values of the group, possibly containing duplicates
     * @param context the task context used to emit output
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(TextPair key, Iterable<Text> values,
            Context context)
            throws IOException, InterruptedException {
        // A hash set gives constant-time duplicate checks; the earlier Vector.contains
        // scan was quadratic in the number of values per key.
        Set<String> seen = new HashSet<String>();
        for (Text value : values) {
            // Set.add returns false when the value was already present.
            if (seen.add(value.toString())) {
                context.write(key.getFirst(), value);
            }
        }
    }
}
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and runs the deduplication MapReduce job.
 */
public class Datadeduplication {
    /**
     * Configures the job and waits for it to finish.
     *
     * <p>Exits with status 2 when the input or output path is missing, and with status 1
     * when the job does not complete successfully.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        if (args.length < 2) {
            System.err.println("Usage: Datadeduplication <input path> <output path>");
            System.exit(2);
        }

        Job job = new Job();
        // Tells Hadoop which jar holds the job classes so they are shipped to the
        // cluster; without it the job only works where the classes are already local.
        job.setJarByClass(Datadeduplication.class);
        job.setJobName("Datadeduplication");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(DatadeduplicationMapper.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        // Partition and group on the first component of the key only.
        job.setPartitionerClass(FirstPartitioner.class);
        job.setGroupingComparatorClass(GroupComparator.class);

        job.setReducerClass(DatadeduplicationReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Report a failed job through the process exit status.
        if (!job.waitForCompletion(false)) {
            System.exit(1);
        }
    }
}
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
pandas 읽기 및 쓰기 Excel — pandas 읽기와 쓰기 Excel은 중복된 데이터 가공 작업을 pandas에 맡기고 수동 노동을 절약하며 사용하기도 편리하지만 출력의 형식은 그다지 아름답지 않다. 본고는 read_excel()과 to_excel()의...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.