例子很简单,我没有运行自带的wordcount,而是自己做了一个简单的例子。
实现的功能是从我们的nginx的access log里面计算url访问的次数。
access log文件:
10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/css/base/base_jiexi-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/js/lib/lib-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /image/jiexi/logo.png HTTP/1.1" 304 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:31 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/lib/lib-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/base/base_jiexi-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 499 0 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/base/base_jiexi-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/lib/lib-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/css/page/jiexi/index-all-min.css HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /bf5bd91c/js/page/jiexi/index-min.js HTTP/1.1" 302 161 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/page/jiexi/index-min.js HTTP/1.1" 200 56215 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/page/jiexi/index-all-min.css HTTP/1.1" 200 21254 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/css/base/base_jiexi-all-min.css HTTP/1.1" 200 22782 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
10.2.112.22 - - [11/Apr/2012:10:25:32 +0800] "GET /release/js/lib/lib-min.js HTTP/1.1" 200 137514 "http://www.jiexi.com/home" "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.28) Gecko/20120306 Firefox/3.6.28"
新建maven项目:
<!--
  Maven build for the access-log MapReduce example.
  Packaging defaults to "jar"; `mvn clean package` writes the artifact to target/.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.jiexi</groupId>
    <artifactId>jiexi-examples</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <!-- Hadoop 1.x core library: provides the org.apache.hadoop.mapred API used by the job. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.0.2</version>
        </dependency>
    </dependencies>
    <build>
        <!--
          Name the packaged artifact access_log.jar so that it matches the
          `hadoop jar access_log.jar ...` command used to submit the job.
          Without this the jar would be named jiexi-examples-0.0.1-SNAPSHOT.jar.
        -->
        <finalName>access_log</finalName>
    </build>
</project>
Mapper代码如下:
package com.jiexi.examples.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
/**
 * Mapper that turns each access-log line into a {@code (request, 1)} pair.
 *
 * <p>The emitted key is the HTTP method followed by the request path with any
 * query string removed, e.g. {@code "GET /image/jiexi/logo.png"}. Lines that do
 * not contain a recognisable GET/POST request are skipped and counted in the
 * {@code AccessLog / MALFORMED_LINES} job counter instead of failing the task.
 */
public class AccessLogMapper extends MapReduceBase implements
        Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    // Reused for every record to avoid allocating a new Text per line.
    private Text url = new Text();
    /** Marker for the start of a POST request line (opening quote included). */
    static final String POST = "\"POST ";
    /** Marker for the start of a GET request line (opening quote included). */
    static final String GET = "\"GET ";
    /**
     * Marker for the protocol token that ends the request target. It matches
     * any HTTP version: the previous value " HTTP/1.0" never matched the
     * "HTTP/1.1" requests found in real logs and made substring() throw.
     */
    static final String END = " HTTP/";
    private static final String COUNTER_GROUP = "AccessLog";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * Emits {@code (request, 1)} for one log line.
     *
     * @param key      input key supplied by the input format; not used
     * @param value    one line of the access log
     * @param output   collector receiving the {@code (request, 1)} pair
     * @param reporter used to count lines that could not be parsed
     * @throws IOException if the collector fails to write the pair
     */
    public void map(LongWritable key, Text value,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        String request = getUrl(value.toString());
        if (request == null) {
            // Skip unparseable lines rather than killing the whole task.
            reporter.incrCounter(COUNTER_GROUP, MALFORMED_COUNTER, 1);
            return;
        }
        url.set(request);
        output.collect(url, one);
    }

    /** Ad-hoc check of the query-string stripping; prints {@code " /user/register"}. */
    public static void main(String[] args) {
        String s = " /user/register?originUrl=http%3A%2F%2Fwww.jiexi.com%2Fhome";
        System.out.println(s.substring(0, s.indexOf("?")));
    }

    /**
     * Extracts the method and path from an access-log line.
     *
     * @param a the raw log line
     * @return the method and path without the query string (for example
     *         {@code "GET /mine"}), or {@code null} when the line has no
     *         GET/POST request or no protocol token after it
     */
    private static String getUrl(String a) {
        if (a == null) {
            return null;
        }
        int post = a.indexOf(POST);
        int get = a.indexOf(GET);
        // Use whichever marker occurs first in the line.
        int begin;
        if (get < 0) {
            begin = post;
        } else if (post < 0) {
            begin = get;
        } else {
            begin = Math.min(get, post);
        }
        if (begin < 0) {
            return null;
        }
        // Search from the request start so earlier text cannot be matched.
        int end = a.indexOf(END, begin);
        if (end < 0) {
            return null;
        }
        // begin + 1 drops the opening quote but keeps the method name.
        String url = a.substring(begin + 1, end);
        int query = url.indexOf('?');
        if (query > 0) {
            return url.substring(0, query);
        }
        return url;
    }
}
Reducer代码如下:
package com.jiexi.examples.hadoop;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
/**
 * Reducer that adds up all counts received for a key and emits the total.
 * It is also registered as the combiner by the job driver.
 */
public class AccessLogReducer extends MapReduceBase implements
        Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums every value for {@code key} and writes {@code (key, total)}.
     *
     * @param key      the key whose counts are being aggregated
     * @param values   the counts collected for {@code key}
     * @param output   collector receiving the {@code (key, total)} pair
     * @param reporter progress reporter; not used
     * @throws IOException if the collector fails to write the pair
     */
    public void reduce(Text key, Iterator<IntWritable> values,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        int total = 0;
        for (Iterator<IntWritable> counts = values; counts.hasNext();) {
            IntWritable count = counts.next();
            total = total + count.get();
        }
        IntWritable result = new IntWritable(total);
        output.collect(key, result);
    }
}
Job调用代码如下:
package com.jiexi.examples.hadoop;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
/**
 * Driver that configures and submits the access-log counting job.
 *
 * <p>Usage: {@code AccessLogPerDayJob <input path> <output path>}
 */
public class AccessLogPerDayJob {

    /**
     * Builds the job configuration and runs the job, blocking until it ends.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be submitted or fails
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            // Print usage instead of failing with ArrayIndexOutOfBoundsException.
            System.err.println("Usage: AccessLogPerDayJob <input path> <output path>");
            System.exit(2);
            return;
        }

        JobConf jobConf = new JobConf(AccessLogPerDayJob.class);
        jobConf.setJobName("access_log");

        // Key/value types produced by the mapper, the combiner and the reducer.
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(IntWritable.class);

        jobConf.setMapperClass(AccessLogMapper.class);
        // The reducer only sums integers, so the same class is used as combiner.
        jobConf.setCombinerClass(AccessLogReducer.class);
        jobConf.setReducerClass(AccessLogReducer.class);

        jobConf.setInputFormat(TextInputFormat.class);
        jobConf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.addInputPath(jobConf, new Path(args[0]));
        FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

        JobClient.runJob(jobConf);
    }
}
===============================================
mvn clean package
把打包生成的jar(此处命名为access_log.jar)丢到namenode的/opt/hadoop下面
运行:
# Create the HDFS directory that will hold the job input
./bin/hadoop dfs -mkdir access_log_in
# List the HDFS directory contents
./bin/hadoop dfs -ls
# Copy the local log files into HDFS
./bin/hadoop dfs -put /opt/access_log/* access_log_in
# Run the job; access_log.jar must first be copied to /opt/hadoop
./bin/hadoop jar access_log.jar com.jiexi.examples.hadoop.AccessLogPerDayJob access_log_in access_log_out
查看job运行情况:
http://10.2.112.31:50030/jobtracker.jsp
查看data:
./bin/hadoop dfs -ls access_log_out