PatentsView:
PatentsView is a free, user-friendly platform that provides access to
USPTO patent data in a variety of formats. The platform allows you to
search and download patent data by keyword, patent number, inventor,
assignee, and other criteria. You can also visualize the data and export
it to CSV or JSON formats. To use PatentsView, simply go to their
website (https://www.patentsview.org/)
Example on the patent dataset:
Step 1: Based on the given data, it appears to be a list of records with two whitespace-separated fields. The first field is a unique identifier, and the second field is a comma-separated list of patent numbers associated with that identifier.
To count the total number of patents in the dataset, you would parse each line and count the patent numbers associated with each identifier. Here is an example dataset:
1 3964859,4647229
10000 4539112
100000 5031388
1000006 4714284
1000007 4766693
1000011 5033339
1000017 3908629
1000026 4043055
1000033 4190903,4975983
1000043 4091523
1000044 4082383,4055371
1000045 4290571
1000046 5918892,5525001
1000049 5996916
1000051 4541310
1000054 4946631
1000065 4748968
1000067 5312208,4944640,5071294
1000070 4928425,5009029
10000 4539112
100000 5031388
1000006 4714284
1000007 4766693
1000011 5033339
1000017 3908629
1000026 4043055
1000033 4190903,4975983
1000043 4091523
1000044 4082383,4055371
1000045 4290571
1000046 5918892,5525001
1000049 5996916
1000051 4541310
1000054 4946631
1000065 4748968
1000067 5312208,4944640,5071294
1000070 4928425,5009029
Step 2: Create a file named PatentCount.java containing the following program:
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Hadoop MapReduce job that counts the number of patents associated with each
 * identifier in a whitespace-delimited dataset.
 *
 * <p>Input format: one record per line, {@code <identifier> <patent[,patent...]>}.
 * Output: {@code <identifier>\t<total patent count>}.
 */
public class PatentCount {

    /**
     * Mapper: parses each input line into an identifier and a comma-separated
     * list of patent numbers, emitting (identifier, number-of-patents).
     */
    public static class PatentCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final Text identifier = new Text();
        private final IntWritable count = new IntWritable();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on any run of whitespace so tab- or multi-space-delimited
            // records are handled; trim() guards against stray leading blanks.
            String[] fields = value.toString().trim().split("\\s+");
            // Skip blank or malformed lines instead of failing the whole task
            // with an ArrayIndexOutOfBoundsException on fields[1].
            if (fields.length < 2) {
                return;
            }
            identifier.set(fields[0]);
            count.set(fields[1].split(",").length);
            context.write(identifier, count);
        }
    }

    /**
     * Reducer: sums the per-record patent counts for each identifier. Also
     * usable as a combiner, since integer addition is associative and
     * commutative.
     */
    public static class PatentCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: PatentCount <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "PatentCount");
        job.setJarByClass(PatentCount.class);
        job.setMapperClass(PatentCountMapper.class);
        // Local pre-aggregation reduces shuffle volume; safe because the
        // reduce operation (integer sum) is associative and commutative.
        job.setCombinerClass(PatentCountReducer.class);
        job.setReducerClass(PatentCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
import java.util.Arrays;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Hadoop MapReduce job that counts the number of patents associated with each
 * identifier in a whitespace-delimited dataset.
 *
 * <p>Input format: one record per line, {@code <identifier> <patent[,patent...]>}.
 * Output: {@code <identifier>\t<total patent count>}.
 */
public class PatentCount {

    /**
     * Mapper: parses each input line into an identifier and a comma-separated
     * list of patent numbers, emitting (identifier, number-of-patents).
     */
    public static class PatentCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final Text identifier = new Text();
        private final IntWritable count = new IntWritable();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on any run of whitespace so tab- or multi-space-delimited
            // records are handled; trim() guards against stray leading blanks.
            String[] fields = value.toString().trim().split("\\s+");
            // Skip blank or malformed lines instead of failing the whole task
            // with an ArrayIndexOutOfBoundsException on fields[1].
            if (fields.length < 2) {
                return;
            }
            identifier.set(fields[0]);
            count.set(fields[1].split(",").length);
            context.write(identifier, count);
        }
    }

    /**
     * Reducer: sums the per-record patent counts for each identifier. Also
     * usable as a combiner, since integer addition is associative and
     * commutative.
     */
    public static class PatentCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: PatentCount <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "PatentCount");
        job.setJarByClass(PatentCount.class);
        job.setMapperClass(PatentCountMapper.class);
        // Local pre-aggregation reduces shuffle volume; safe because the
        // reduce operation (integer sum) is associative and commutative.
        job.setCombinerClass(PatentCountReducer.class);
        job.setReducerClass(PatentCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Step 3: Compile the Java source into class files using the following commands:
hduser@ubuntu:~/patentcount$ export CLASSPATH=`hadoop classpath`
hduser@ubuntu:~/patentcount$ echo $CLASSPATH
hduser@ubuntu:~/patentcount$ javac -d . PatentCount.java
hduser@ubuntu:~/patentcount$ echo $CLASSPATH
hduser@ubuntu:~/patentcount$ javac -d . PatentCount.java
Step 4: Create a jar file using the following command:
hduser@ubuntu:~/patentcount$ jar -cvf count.jar -C /home/hduser/patentcount .
Step 5: Create a folder named rkpd in HDFS and copy the patent dataset file dataset.txt into it using the following commands:
hduser@ubuntu:~/patentcount$ hadoop fs -mkdir /rkpd
hduser@ubuntu:~/patentcount$ hadoop fs -put dataset.txt /rkpd
hduser@ubuntu:~/patentcount$ hadoop fs -lsr /rkpd
hduser@ubuntu:~/patentcount$ hadoop fs -mkdir /rkpd
hduser@ubuntu:~/patentcount$ hadoop fs -put dataset.txt /rkpd
hduser@ubuntu:~/patentcount$ hadoop fs -lsr /rkpd
Step 6: Now run the job with the hadoop jar command:
hduser@ubuntu:~/patentcount$ hadoop jar count.jar PatentCount /rkpd/dataset.txt /rkpd/out
Step 7: Now we can check the per-identifier patent counts computed for dataset.txt by reading the job output directory, for example:
hduser@ubuntu:~/patentcount$ hadoop fs -cat /rkpd/out/part-r-00000
No comments:
Post a Comment