四、基于hadoop的nginx访问日志分析---top 10 request
代码:
# coding=utf-8
"""Report the ten most-requested URLs found in an nginx access log.

Runs as a two-step mrjob job: step 1 counts requests per URL, step 2 keeps
the ten URLs with the highest counts.
"""
import heapq

from mrjob.job import MRJob
from mrjob.step import MRStep

from nginx_accesslog_parser import NginxLineParser


class UrlRequest(MRJob):
    """MapReduce job that emits ``count, url`` for the 10 most requested URLs."""

    # A single parser object is reused for every input line; the mapper reads
    # its ``request`` attribute right after each ``parse()`` call.
    nginx_line_parser = NginxLineParser()

    def mapper(self, _, line):
        """Emit ``(request, 1)`` for one access-log line.

        The input key is ignored; ``line`` is the raw log line.
        """
        self.nginx_line_parser.parse(line)
        yield self.nginx_line_parser.request, 1

    def combiner_sum(self, key, values):
        """Pre-sum the counts of one URL before the shuffle.

        Addition is associative, so partially summing here does not change
        the totals computed by ``reducer_sum``; it only reduces how many
        ``(url, 1)`` pairs have to be transferred to the reducers.
        """
        yield key, sum(values)

    def reducer_sum(self, key, values):
        """Emit ``(None, (total, url))`` for one URL.

        Using ``None`` as the key for every URL routes all totals to a single
        reducer call in the next step, which can then rank them together.
        """
        yield None, (sum(values), key)

    def reducer_top10(self, _, values):
        """Emit the ten ``(count, url)`` pairs with the largest counts.

        Pairs are yielded in descending order as returned by
        ``heapq.nlargest``.
        """
        for count, path in heapq.nlargest(10, values):
            yield count, path

    def steps(self):
        """Return the job's two steps: per-URL counting, then top-10 selection."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner_sum,
                reducer=self.reducer_sum,
            ),
            MRStep(reducer=self.reducer_top10),
        ]


def main():
    """Run the job using mrjob's command-line entry point."""
    UrlRequest.run()


if __name__ == '__main__':
    main()
结果:
# python3 top_10_request.py access_all.log-20161227 No configs found; falling back on auto-configuration Creating temp directory /tmp/top_10_request.root.20161228.055055.459306 Running step 1 of 2... Running step 2 of 2... Streaming final output from /tmp/top_10_request.root.20161228.055055.459306/output... 62728 "/forum.php" 47274 "/index.php" 45777 "/sync/avatar.php" 34568 "/" 12358 "/home.php" 3889 "/misc.php" 3044 "/static/image/common/swfupload.swf" 2666 "/thread-114874-1-1.html" 1585 "/favicon.ico" 1143 "/data/cache/style_2_common.css" Removing temp directory /tmp/top_10_request.root.20161228.055055.459306...
代码解释:
mapper()方法:接收一行访问日志,将数据解析成key=请求的URL,value=1
reducer_sum()方法:计算出每个请求URL的访问量,并以 key=null、value=[访问量, URL] 的形式输出(例如 null [3, "/forum.php"]);所有结果使用同一个key,因此会汇聚到下一步的同一次reducer调用中统一处理
reducer_top10()方法:从所有 [3, "/forum.php"] 这样的数据中取出访问量最大的前10条,并按访问量从高到低输出