执行脚本:
# Path to the piggybank UDF jar bundled with Pig (requires $PIG_HOME to be set).
PIGGYBANK_PATH="$PIG_HOME/contrib/piggybank/java/piggybank.jar"
# Input access log to analyze.
INPUT=pig/input/test-pig-full.txt
# Timestamped output directory — Pig fails if the output path already exists.
OUTPUT="pig/output/test-pig-output-$(date +%Y%m%d%H%M%S)"
# Pig script to run; swap in any of the alternatives listed below.
PIGSCRIPT=analyst_status_logs.pig
#analyst_500_404_month.pig
#analyst_500_404_day.pig
#analyst_404_percentage.pig
#analyst_500_percentage.pig
#analyst_unique_path.pig
#analyst_user_logs.pig
#analyst_status_logs.pig
# Pass the jar, input, and output paths to the script as Pig parameters
# (referenced inside the script as $PIGGYBANK_PATH, $INPUT, $OUTPUT).
# All expansions are quoted so paths containing spaces do not word-split.
pig -p "PIGGYBANK_PATH=$PIGGYBANK_PATH" -p "INPUT=$INPUT" -p "OUTPUT=$OUTPUT" "$PIGSCRIPT"
要分析的数据源,LOG 文件
46.20.45.18 - - [25/Dec/2012:23:00:25 +0100] "GET / HTTP/1.0" 302 - "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 46.20.45.18 "" 11011AEC9542DB0983093A100E8733F8 0
46.20.45.18 - - [25/Dec/2012:23:00:25 +0100] "GET /sign-in.jspx HTTP/1.0" 200 3926 "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 46.20.45.18 "" 11011AEC9542DB0983093A100E8733F8 0
69.59.28.19 - - [25/Dec/2012:23:01:25 +0100] "GET / HTTP/1.0" 302 - "-" "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)" "-" "-" 69.59.28.19 "" 36D80DE7FE52A2D89A8F53A012307B0A 15
PIG脚本:
-- Register the piggybank jar so its UDFs (DateExtractor, MyRegExLoader) are available.
register '$PIGGYBANK_PATH';
-- Define short aliases for the date-extraction UDFs (month and day granularity).
DEFINE DATE_EXTRACT_MM
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM');
DEFINE DATE_EXTRACT_DD
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');
-- pig/input/test-pig-full.txt
-- Load the log file named by $INPUT; each regex capture group becomes one
-- named field of the relation (a bag of tuples).
raw_logs = load '$INPUT' USING org.apache.pig.piggybank.storage.MyRegExLoader('^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] "(\\S+) (\\S+) (HTTP[^"]+)" (\\S+) (\\S+) "([^"]*)" "([^"]*)" "(\\S+)" "(\\S+)" (\\S+) "(.*)" (\\S+) (\\S+)')
as (remoteAddr: chararray,
n2: chararray,
n3: chararray,
time: chararray,
method: chararray,
path:chararray,
protocol:chararray,
status: int,
bytes_string: chararray,
referrer: chararray,
browser: chararray,
n10:chararray,
remoteLogname: chararray,
remoteAddr12: chararray,
path2: chararray,
sessionid: chararray,
n15: chararray
);
-- Drop monitoring-bot traffic. Pig's `matches` uses case-sensitive Java
-- regex, so the original pattern '.*pingdom.*' never matched the sample
-- data's "Pingdom.com_bot_..." user agent (capital P) and nothing was
-- filtered out; the inline (?i) flag makes the match case-insensitive.
filter_logs = FILTER raw_logs BY not (browser matches '(?i).*pingdom.*');
--item_logs = FOREACH raw_logs GENERATE browser;
--percent 500 logs
-- Project down to the two fields needed: status and the request month.
reitem_percent_500_logs = FOREACH filter_logs GENERATE status,DATE_EXTRACT_MM(time) as month;
-- Group by month; result is (group, {bag of (status, month) tuples}).
group_month_percent_500_logs = GROUP reitem_percent_500_logs BY (month);
-- For each month group, compute the percentage of HTTP 500 responses.
final_month_500_logs = FOREACH group_month_percent_500_logs
{
-- Inside a nested FOREACH, referencing the grouped relation means the
-- per-group bag, so COUNT gives the number of requests in this month.
total = COUNT(reitem_percent_500_logs);
-- Re-filter the per-group bag down to the rows whose status is 500.
t = filter reitem_percent_500_logs by status== 500; --create a bag which contains only T values
-- Emit (month, percentage of 500 responses for that month).
generate flatten(group) as col1, 100*(double)COUNT(t)/(double)total;
}
STORE final_month_500_logs into '$OUTPUT' using PigStorage(',');