统计域名(实际是host)的计数器。
输入:一个文件夹中有一堆的文本文件,内容是一行一个的url,可以想像为数据库中的一条记录
流程:提取url的domain,对domain计数+1
输出:域名,域名计数
代码如下:
Mapper
package com.keseek.hadoop;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
import java.io.IOException;
import java.net.URI;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Mapper;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
public class DomainCountMapper implements
data:image/s3,"s3://crabby-images/2a1f3/2a1f35146451967292b66fa62c8f22027e7067cf" alt=""
Mapper<LongWritable, Text, Text, LongWritable>
{
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public void configure(JobConf arg0)
{
// Init Text and LongWritable
domain = new Text();
one = new LongWritable(1);
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public void close() throws IOException
{
// TODO Auto-generated method stub
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, LongWritable> output, Reporter reporter)
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
throws IOException
{
// Get URL
String url = value.toString().trim();
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
// URL->Domain && Collect
domain.set(ParseDomain(url));
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
if (domain.getLength() != 0)
{
output.collect(domain, one);
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public String ParseDomain(String url)
{
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
try
{
URI uri = URI.create(url);
return uri.getHost();
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
} catch (Exception e)
{
return "";
}
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
// Shared used Text domain
private Text domain;
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
// One static
private LongWritable one;
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
}Reducer
package com.keseek.hadoop;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
import java.io.IOException;
import java.util.Iterator;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Reducer;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
public class DomainCountReducer implements
data:image/s3,"s3://crabby-images/2a1f3/2a1f35146451967292b66fa62c8f22027e7067cf" alt=""
Reducer<Text, LongWritable, Text, LongWritable>
{
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public void configure(JobConf arg0)
{
// TODO Auto-generated method stub
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public void close() throws IOException
{
// TODO Auto-generated method stub
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
@Override
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
throws IOException
{
// Count the domain
long cnt = 0;
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
while (values.hasNext())
{
cnt += values.next().get();
}
// Output
output.collect(key, new LongWritable(cnt));
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
}Main
package com.keseek.hadoop;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
data:image/s3,"s3://crabby-images/9e1b5/9e1b5b2a3e46b5341b22649797d1794392182f55" alt=""
data:image/s3,"s3://crabby-images/2a1f3/2a1f35146451967292b66fa62c8f22027e7067cf" alt=""
public class DomainCountMain
{
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
public static void main(String[] args) throws Exception
{
// Param for path
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
if (args.length != 2)
{
System.out.println("Usage:");
System.out
.println("DomainCountMain.jar <Input_Path> <Outpu_Path>");
System.exit(-1);
}
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
// Configure JobConf
JobConf jobconf = new JobConf(DomainCountMain.class);
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
jobconf.setJobName("Domain Counter by Coder4");
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
FileInputFormat.setInputPaths(jobconf, new Path(args[0]));
FileOutputFormat.setOutputPath(jobconf, new Path(args[1]));
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
jobconf.setInputFormat(TextInputFormat.class);
jobconf.setOutputFormat(TextOutputFormat.class);
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
jobconf.setMapperClass(DomainCountMapper.class);
jobconf.setReducerClass(DomainCountReducer.class);
jobconf.setCombinerClass(DomainCountReducer.class);
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
jobconf.setMapOutputKeyClass(Text.class);
jobconf.setMapOutputValueClass(LongWritable.class);
jobconf.setOutputKeyClass(Text.class);
jobconf.setOutputValueClass(LongWritable.class);
data:image/s3,"s3://crabby-images/96c01/96c01a9005d00151a1af2189b6a9f266915ba654" alt=""
// Run job
RunningJob run = JobClient.runJob(jobconf);
run.waitForCompletion();
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
if (run.isSuccessful())
{
System.out.println("<<<DomainCount Main>>> success.");
data:image/s3,"s3://crabby-images/8d7d9/8d7d99ac571b1efcbf1f7e7a4120707c8e90d1fd" alt=""
} else
{
System.out.println("<<<DomainCount Main>>> error.");
}
}
}