youngfan007 发表于 2016-12-9 08:52:31

Hadoop 统计专利被哪些专利所引用(一)

  一、以下是测试数据:

"CITING","CITED"
3858241,956203
3858241,1324234
3858241,3398406
3858241,3557384
3858241,3634889
3858242,1515701
3858242,3319261
3858242,3668705
3858242,3707004
3858243,2949611
3858243,3146465
3858243,3156927
3858243,3221341
3858243,3574238
3858243,3681785
3858243,3684611
3858244,14040
3858244,17445
3858245,17445
  注:第一列是发起引用的专利号(CITING),第二列是被其引用的专利号(CITED)。
  二、Hadoop 代码如下:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class PatentCitations extends Configured implements Tool {
public static class PatentCitationsMapper extends Mapper<Text, Text, Text, Text> {
@Override
protected void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, key);
}
}
public static class PatentCitationsReduces extends Reducer<Text, Text, Text, Text> {
private static Text staticVal = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
StringBuilder sb = new StringBuilder();
for (Text value : values) {
if (sb.length() > 0) {
sb.append(",");
}
sb.append(value.toString());
}
staticVal.set(sb.toString());
context.write(key,staticVal);
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, ",");
Job job = new Job(getConf());
job.setJarByClass(getClass());
job.setJobName("patentcitations");
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(PatentCitationsMapper.class);
job.setReducerClass(PatentCitationsReduces.class);
FileInputFormat.setInputPaths(job, new Path("/patent/test/input/file1.txt"));
FileOutputFormat.setOutputPath(job, new Path("/patent/test/output"));
//FileInputFormat.setInputPaths(job, new Path(args));
//FileOutputFormat.setOutputPath(job, new Path(args));
boolean success = job.waitForCompletion(true);
return success ? 0: 1;
}
public static void main(String[] args) throws Exception{
int result = ToolRunner.run(new PatentCitations(), args);
System.exit(result);
}
}

  三、执行结果如下:

"CITED"	"CITING"
1324234	3858241
14040	3858244
1515701	3858242
17445	3858245,3858244
2949611	3858243
3146465	3858243
3156927	3858243
3221341	3858243
3319261	3858242
3398406	3858241
3557384	3858241
3574238	3858243
3634889	3858241
3668705	3858242
3681785	3858243
3684611	3858243
3707004	3858242
956203	3858241
  注:17445 分别被 3858245,3858244 所引用。
页: [1]
查看完整版本: Hadoop 统计专利被哪些专利所引用(一)