scuess 发表于 2016-12-28 09:36:22

【Spark七十七】Spark分析Nginx和Apache的access.log

  Spark分析Nginx和Apache的access.log,第一个问题是要对Nginx和Apache的access.log文件进行按行解析,按行解析的方法是使用正则表达式:
  Nginx的access.log解析正则表达式

val PATTERN = """([^ ]*) ([^ ]*) ([^ ]*) (\\[.*\\]) (\".*?\") (-|*) (-|*) (\".*?\") (\".*?\")""".r
  Apache的access.log解析正则表达式

val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  注意最后一行的.r用于指明PATTERN是一个正则表达式对象,String.r返回的是Regex类型的对象
  ApacheAccess日志解析工具类

/**
 * Immutable container for one parsed Apache access-log entry (acts like a JavaBean).
 *
 * @param ipAddress    client IP address
 * @param clientIdentd RFC 1413 identity reported by the client ("-" when absent)
 * @param userId       authenticated user id ("-" when absent)
 * @param dateTime     request timestamp, e.g. "21/Jul/2014:09:55:27 -0800"
 * @param method       HTTP method, e.g. "GET"
 * @param endpoint     requested path
 * @param protocol     protocol version, e.g. "HTTP/1.1"
 * @param responseCode HTTP status code (three digits)
 * @param contentSize  response body size in bytes
 */
case class ApacheAccessLog(
    ipAddress: String,
    clientIdentd: String,
    userId: String,
    dateTime: String,
    method: String,
    endpoint: String,
    protocol: String,
    responseCode: Int,
    contentSize: Long)

/**
 * Retrieves information from a log line using a regular expression.
 */
object ApacheAccessLog {
  // Capture groups: ip, identd, userId, [date], "method endpoint protocol", status, size.
  val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r

  /**
   * Parses a single Apache access-log line.
   *
   * @param log one raw line from the access log
   * @return the parsed [[ApacheAccessLog]]
   * @throws RuntimeException if the line does not match PATTERN
   */
  def parseLogLine(log: String): ApacheAccessLog =
    PATTERN.findFirstMatchIn(log) match {
      case Some(m) =>
        ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4),
          m.group(5), m.group(6), m.group(7), m.group(8).toInt, m.group(9).toLong)
      case None =>
        throw new RuntimeException("Cannot parse log line: " + log)
    }

  def main(args: Array[String]): Unit = {
    // Fixed two defects in the original demo: `args: Array` lacked its type
    // parameter (compile error), and the sample line lacked the "[date]" field
    // required by PATTERN, so parseLogLine always threw at runtime.
    val line = """192.13.212.25 - - [21/Jul/2014:09:55:27 -0800] "GET /abc/ HTTP/1.1" 200 280"""
    val log = parseLogLine(line)
    println(log.ipAddress)
    println(log.clientIdentd)
    println(log.userId)
    println(log.dateTime)
    println(log.method)
    println(log.endpoint)
    println(log.protocol)
    println(log.responseCode)
    println(log.contentSize)
  }
}
  http://www.iteblog.com/archives/1250
页: [1]
查看完整版本: 【Spark七十七】Spark分析Nginx和Apache的access.log