设为首页 收藏本站
查看: 550|回复: 0

[经验分享] 基于hadoop平台的pig语言对apache日志系统的分析

[复制链接]

尚未签到

发表于 2016-12-13 10:13:19 | 显示全部楼层 |阅读模式
pig脚本如下
-- Stage 1: load the raw access log, pull out the common request fields and
-- route each request to the ia or ihm branch by its request path.

-- Put the contents of myudfs.jar on the script classpath.
REGISTER myudfs.jar;

-- Alias for the piggybank DateExtractor configured with an output pattern.
-- NOTE(review): no statement visible in this script references DayExtractor,
-- the projection further down calls myudfs.DateExtractor instead - confirm
-- which of the two is intended.
-- NOTE(review): in Java date patterns hh is the 12-hour clock and HH the
-- 24-hour clock - confirm that hh is really wanted here.
DEFINE DayExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd hh:mm:ss');

-- Read the log as ten text columns. PigStorage is called without an
-- argument, so its default field delimiter applies.
raw_log = LOAD '/user/input/test/hp_analytics.ifensi.com-access_log.log'
    USING PigStorage()
    AS (
        l1:chararray,
        l2:chararray,
        l3:chararray,
        l4:chararray,
        l5:chararray,
        l6:chararray,
        l7:chararray,
        l8:chararray,
        l9:chararray,
        l10:chararray
    );

-- Extract the fields used downstream:
--   ip, otherargs : l1 split at the first comma
--   date          : characters 1 up to 21 of l4
--   url           : the middle token of the quoted request line in l5
--   referer, useragent, vuid : l8, l9, l10 with the first and last
--                              character removed
-- NOTE(review): when l1 holds no comma STRSPLIT returns a single field while
-- the schema names two - confirm how such rows should be handled.
parsed_log = FOREACH raw_log GENERATE
    FLATTEN(STRSPLIT(l1, '\\,', 2)) AS (ip, otherargs),
    SUBSTRING(l4, 1, 21) AS date,
    FLATTEN(REGEX_EXTRACT_ALL(l5, '\\"[^ ]* ([^ ]*) [^\\"]*\\"')) AS url,
    FLATTEN(REGEX_EXTRACT_ALL(l8, '.{1}(.*).{1}')) AS referer,
    FLATTEN(REGEX_EXTRACT_ALL(l9, '.{1}(.*).{1}')) AS useragent,
    FLATTEN(REGEX_EXTRACT_ALL(l10, '.{1}(.*).{1}')) AS vuid;

-- Keep only the rows whose vuid field begins with the literal text vuid.
vuid_log = FILTER parsed_log BY SUBSTRING(vuid, 0, 4) == 'vuid';

-- Reformat the date through the custom UDF, split the url at the first
-- question mark into path (cmd) and query string (args), and drop the first
-- five characters of the vuid field.
-- NOTE(review): a url without a question mark yields only one field from
-- STRSPLIT while the schema names two - confirm such rows cannot occur.
request_log = FOREACH vuid_log GENERATE
    ip,
    myudfs.DateExtractor(date) AS date,
    FLATTEN(STRSPLIT(url, '\\?', 2)) AS (cmd, args),
    referer,
    useragent,
    FLATTEN(REGEX_EXTRACT_ALL(vuid, '.{5}(.*)')) AS vuid;

-- Route by request path. Rows matching neither path are not kept.
SPLIT request_log INTO
    ihm IF cmd == '/__ihm.gif',
    ia IF cmd == '/__ia.gif';
-- ia process block
-- Handles the requests routed to the ia relation: splits the timestamp,
-- reads the version parameter, then parses the remaining query string with a
-- version-specific pattern and stores one output per version.

-- Split the date field at the first pipe into date and time, and peel the
-- leading version parameter off the query string.
-- NOTE(review): assumes the date value always contains a pipe - confirm
-- against the output of the date UDF.
ia_base = FOREACH ia GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ia_version, ia_other),
    referer,
    useragent;

-- Only versions 1.0 and 1.1 are processed further.
SPLIT ia_base INTO
    ia_version1 IF ia_version == '1.0',
    ia_version2 IF ia_version == '1.1';

-- Version 1.0: twelve query parameters in a fixed order.
ia_v1_fields = FOREACH ia_version1 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&vuid=([^&]*)'))
        AS (
            ia_browser,
            ia_browser_version,
            ia_operation_system,
            ia_operation_system_version,
            ia_flash_version,
            ia_java_enabled,
            ia_language,
            ia_screen_colors,
            ia_screen_resolution,
            ia_referrer,
            ia_tourl,
            ia_vuid
        ),
    referer,
    useragent;

-- Output projection for version 1.0, same columns in the same order.
ia_v1_out = FOREACH ia_v1_fields GENERATE
    vuid,
    ip,
    date,
    time,
    ia_browser,
    ia_browser_version,
    ia_operation_system,
    ia_operation_system_version,
    ia_flash_version,
    ia_java_enabled,
    ia_language,
    ia_screen_colors,
    ia_screen_resolution,
    ia_referrer,
    ia_tourl,
    ia_vuid,
    referer,
    useragent;

STORE ia_v1_out INTO '/test/output/data/ia/ia_version1' USING PigStorage();

-- Version 1.1: the 1.0 parameters plus title, muid, mfid, musername, memail.
ia_v2_fields = FOREACH ia_version2 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&title=([^&]*)&vuid=([^&]*)&muid=([^&]*)&mfid=([^&]*)&musername=([^&]*)&memail=([^&]*)'))
        AS (
            ia_browser,
            ia_browser_version,
            ia_operation_system,
            ia_operation_system_version,
            ia_flash_version,
            ia_java_enabled,
            ia_language,
            ia_screen_colors,
            ia_screen_resolution,
            ia_referrer,
            ia_tourl,
            ia_title,
            ia_vuid,
            ia_muid,
            ia_mfid,
            ia_musername,
            ia_memail
        ),
    referer,
    useragent;

-- Output projection for version 1.1. The member fields (muid, mfid,
-- musername, memail) are moved behind referer and useragent.
ia_v2_out = FOREACH ia_v2_fields GENERATE
    vuid,
    ip,
    date,
    time,
    ia_browser,
    ia_browser_version,
    ia_operation_system,
    ia_operation_system_version,
    ia_flash_version,
    ia_java_enabled,
    ia_language,
    ia_screen_colors,
    ia_screen_resolution,
    ia_referrer,
    ia_tourl,
    ia_title,
    ia_vuid,
    referer,
    useragent,
    ia_muid,
    ia_mfid,
    ia_musername,
    ia_memail;

STORE ia_v2_out INTO '/test/output/data/ia/ia_version2' USING PigStorage();
-- ihm process block
-- Handles the requests routed to the ihm relation: splits the timestamp,
-- parses the query string and stores the resulting columns.

-- Split the date field at the first pipe into date and time, and peel the
-- leading version parameter off the query string.
-- NOTE(review): assumes the date value always contains a pipe - confirm
-- against the output of the date UDF.
ihm_base = FOREACH ihm GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ihm_version, ihm_other),
    referer,
    useragent;

-- Parse the remaining parameters, expected in the order vuid, url, width, x, y.
-- The y group takes everything to the end of the query string.
ihm_fields = FOREACH ihm_base GENERATE
    vuid,
    ip,
    date,
    time,
    ihm_version,
    FLATTEN(REGEX_EXTRACT_ALL(ihm_other, 'vuid=([^&]*)&url=([^&]*)&width=([^&]*)&x=([^&]*)&y=(.*)'))
        AS (ihm_vuid, ihm_url, ihm_width, ihm_x, ihm_y),
    referer,
    useragent;

-- Output projection: ihm_version is dropped and ihm_width is moved behind
-- ihm_x and ihm_y.
ihm_out = FOREACH ihm_fields GENERATE
    vuid,
    ip,
    date,
    time,
    ihm_vuid,
    ihm_url,
    ihm_x,
    ihm_y,
    ihm_width,
    referer,
    useragent;

STORE ihm_out INTO '/test/output/data/ihm' USING PigStorage();
附件为部分日志文件

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦:870801961 群⑧:679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.iyunv.com/thread-313652-1-1.html 上篇帖子: Hadoop+hbase+thrift H.H.T环境部署 下篇帖子: 对配hadoop时用到的一些命令,查到的详细解释做个笔记
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表