"""Streaming parser for web-server access logs in the "combined" format."""

import calendar
import re
import sys
import time

# One capture group per field of a combined-format log line.  The request
# field is split into method, url, an optional query string (including its
# leading '?'), and protocol.
combined_format_re = re.compile(
    r'(?P<client>.*?) (?P<ident>.*?) (?P<authuser>.*?) '
    r'\[(?P<date>.*?)\] '
    r'"(?P<method>.*?) (?P<url>.*?)(?P<query>\?.*?)? (?P<proto>.*?)" '
    r'(?P<status>\d*) (?P<bytes>.*?) '
    r'"(?P<referer>.*?)" "(?P<agent>.*?)"'
)


class bunch(object):
    """Empty attribute container; fields are attached with ``setattr``."""
    pass


# Month abbreviation -> month number.  Not referenced in this module; kept
# because code outside this file may import it.
month_list = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
}

# Upper bound passed to ``readline``.  A longer physical line is returned in
# several pieces, and pieces that do not match the pattern are reported as
# bad lines.
MAX_LINE_SIZE = 1200


class Parser(object):
    """Iterate over the entries of a combined-format access log.

    ``file_obj`` must provide ``readline(size)`` returning text.  Iterating
    yields a cursor object with these attributes:

    * ``client``, ``ident``, ``authuser``, ``date``, ``method``, ``proto``,
      ``referer``, ``agent`` -- the raw matched strings
    * ``utime``  -- the entry's time as seconds since the epoch (float)
    * ``status`` -- the status code as an int
    * ``bytes``  -- the response size as an int; ``'-'`` becomes 0
    * ``url``    -- the request path with runs of slashes collapsed and a
      trailing ``index.html`` removed; the query string is not included

    The SAME cursor object is yielded for every entry and overwritten in
    place, so copy the attributes you need before advancing the iterator.
    """

    # Collapses any run of two or more slashes in the request path.
    _url_norm_re = re.compile('//+')

    def __init__(self, file_obj):
        self._fd = file_obj

    def __iter__(self):
        """Yield the shared cursor once per parseable log line.

        Blank lines are skipped silently; other lines that do not match the
        pattern are reported on stderr and skipped.

        Raises:
            ValueError: if a matched line has a non-numeric status or byte
                count, or a date that ``make_time`` cannot parse.
        """
        fd = self._fd
        url_norm_re = self._url_norm_re
        cursor = bunch()
        cursor._fd = fd
        # Fields copied verbatim from the match onto the cursor.
        attrs = ('client', 'ident', 'authuser', 'date',
                 'method', 'proto', 'referer', 'agent')
        while True:
            line = fd.readline(MAX_LINE_SIZE)
            if not line:
                # An empty read means end of file.
                break
            match = combined_format_re.search(line)
            if not match:
                if line.strip():
                    sys.stderr.write('Bad line: %s' % line)
                continue
            for attr in attrs:
                setattr(cursor, attr, match.group(attr))
            cursor.utime = self.make_time(match.group('date'))
            cursor.status = int(match.group('status'))
            size = match.group('bytes')
            # The log writes '-' when no response body was sent.
            cursor.bytes = 0 if size == '-' else int(size)
            url = url_norm_re.sub('/', match.group('url'))
            if url.endswith('/index.html'):
                # Drop 'index.html' (10 characters) but keep the slash.
                url = url[:-10]
            cursor.url = url
            yield cursor

    def make_time(self, date):
        """Convert a log timestamp to seconds since the epoch.

        ``date`` must look like ``10/Oct/2000:13:55:36 +0000``.  The offset
        is fixed at ``+0000``, so the fields are UTC and are converted with
        ``calendar.timegm``; ``time.mktime`` would read them as local time
        and skew the result by the host's UTC offset.

        Returns:
            The timestamp as a float.

        Raises:
            ValueError: if ``date`` does not match the expected format,
                including any offset other than ``+0000``.
        """
        parsed = time.strptime(date, '%d/%b/%Y:%H:%M:%S +0000')
        return float(calendar.timegm(parsed))