import httpcodes
import sys
import os
import api
import time
try:
    import DNS
except ImportError:
    DNS = None

# Number of seconds in a day; used to convert the observed time span of the
# log into the day/week/month figures shown by Hits.report().
_SECONDS_PER_DAY = 60 * 60 * 24


def _escape_html(text):
    """
    Return text with the characters that are special in HTML
    (``&``, ``<``, ``>`` and ``"``) replaced by entities.
    """
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def iterwatch(watch_func, iterator):
    """
    Make a new iterator which yields all the items from iterator, but
    first calls watch_func(item) for each item.
    """
    for item in iterator:
        watch_func(item)
        yield item


def pass_through(watchers):
    """
    Make a single watcher from a list of watchers, by using
    pass_through to send data from one watcher to the next.

    Returns the first watcher of the chain.  Raises IndexError when
    watchers is empty.
    """
    watcher = last_watcher = watchers[0]
    for next_watcher in watchers[1:]:
        last_watcher.pass_through = next_watcher
        last_watcher = next_watcher
    return watcher


def make_link(url, max=70, follow=False):
    """
    Return an HTML anchor pointing at url.

    When max is truthy and the URL is longer than max characters, the
    visible text is shortened to the first ``max - 20`` characters, an
    ellipsis and the last 20 characters.  Unless follow is true the
    anchor carries ``rel="nofollow"``.
    """
    if max and len(url) > max:
        text = url[:max - 20] + '...' + url[-20:]
    else:
        text = url
    if follow:
        rel = ''
    else:
        rel = ' rel="nofollow"'
    # NOTE(review): the previous format string was a bare '%s' fed with three
    # arguments, which raises TypeError; the anchor markup is reconstructed
    # from the arguments -- confirm against the intended output.
    # URLs come from log data, so escape them before embedding in markup.
    return '<a href="%s"%s>%s</a>' % (
        _escape_html(url), rel, _escape_html(text))


class Watcher(api.Pipe):
    """
    Base class for objects that are called with each log record and
    can later produce a report.

    Watchers can be chained: when pass_through is set, iterlog() feeds
    the records this watcher has seen on to that next watcher.
    """

    pass_through = None
    _abstract = 'Watcher'

    def title__get(self):
        return self.__class__.__name__
    title = property(title__get)

    @classmethod
    def is_abstract(cls):
        """True when cls is the class that declared itself abstract."""
        return cls.__name__ == cls._abstract

    def name__get(self):
        return self.__class__.__name__
    name = property(name__get)

    def add_pass_through(self, pass_through):
        """Set the next watcher in the chain and return self."""
        self.pass_through = pass_through
        return self

    def iterlog(self, log_items):
        """
        Return an iterator over log_items that calls this watcher (and
        any chained pass_through watcher) on every item it yields.
        """
        if not self.pass_through:
            return iter(self.watchiter(log_items))
        return iter(self.pass_through.iterlog(self.watchiter(log_items)))

    def watchiter(self, log_items):
        """Yield each item of log_items after calling self on it."""
        for log in log_items:
            self(log)
            yield log


class Hits(Watcher):
    """Count records and bytes, and report them per day/week/month."""

    count = 0
    # Start at +infinity so the first record always sets the minimum.  A
    # value derived from time.time() at import would go stale in a
    # long-running process.
    min_time = float('inf')
    max_time = 0
    bytes = 0

    def __call__(self, log):
        self.min_time = min(self.min_time, log.utime)
        self.max_time = max(self.max_time, log.utime)
        self.count += 1
        self.bytes += log.bytes

    def _per_period(self, amount, periods):
        """
        Return amount divided by periods, or amount unchanged when
        periods is not positive (no span to extrapolate over).
        """
        if periods > 0:
            return amount / float(periods)
        return amount

    def report(self):
        """
        Return the report text, one figure per line.

        Also stores the observed span on the instance as days, weeks
        and months (a month is approximated as 30 days).
        """
        # Clamp to zero: with no records max_time is still below min_time.
        span = max(self.max_time - self.min_time, 0)
        self.days = span / float(_SECONDS_PER_DAY)
        self.weeks = self.days / 7.0
        self.months = self.days / 30.0
        parts = [
            'Total hits: %s' % self.format_amount(self.count),
            '%s/day (%i days)' % (
                self.format_amount(self._per_period(self.count, self.days)),
                self.days),
            '%s/week (%i weeks)' % (
                self.format_amount(self._per_period(self.count, self.weeks)),
                self.weeks),
            '%s/month (%.1f months)' % (
                self.format_amount(self._per_period(self.count, self.months)),
                self.months),
        ]
        self.add_byte_report(parts)
        return '\n'.join(parts)

    def add_byte_report(self, parts):
        """Append the traffic lines to parts (uses self.months)."""
        parts.extend([
            'Traffic: %s' % self.format_bytes(self.bytes),
            '%s/month' % self.format_bytes(
                self._per_period(self.bytes, self.months)),
        ])

    def format_amount(self, count):
        """Format count as an integer with comma thousands separators."""
        number = int(count)
        # Handle the sign separately so it is never treated as a digit group.
        if number < 0:
            sign = '-'
        else:
            sign = ''
        digits = str(abs(number))
        groups = []
        while len(digits) > 3:
            groups.append(digits[-3:])
            digits = digits[:-3]
        groups.append(digits)
        groups.reverse()
        return sign + ','.join(groups)

    def format_bytes(self, bytes):
        """Format a byte count using decimal (1000-based) units."""
        if bytes < 1000:
            return '%i bytes' % bytes
        if bytes < 1000 * 1000:
            return '%iKb' % (bytes / 1000.)
        if bytes < 1000 * 1000 * 10:
            return '%.1fMb' % (bytes / 1000. / 1000.)
        if bytes < 1000 * 1000 * 1000:
            return '%iMb' % (bytes / 1000. / 1000.)
        # Float division so the single decimal place is meaningful.
        return '%.1fGb' % (bytes / 1000. / 1000. / 1000.)


class Bytes(Hits):
    """Like Hits, but count accumulates the bytes of each record."""

    def __call__(self, log):
        self.min_time = min(self.min_time, log.utime)
        self.max_time = max(self.max_time, log.utime)
        self.count += log.bytes

    def format_amount(self, bytes):
        """Format a byte count using binary (1024-based) units."""
        if bytes > 1024 * 1024 * 1024:
            return '%.1fGb' % (bytes / 1024.0 / 1024.0 / 1024.0)
        if bytes > 1024 * 1024:
            return '%.1fMb' % (bytes / 1024.0 / 1024.0)
        if bytes > 1024:
            return '%iKb' % (bytes / 1024.0)
        return '%i bytes' % bytes


class UniqueHits(Hits):
    """Count each distinct log.client once."""

    def __init__(self, **kw):
        Hits.__init__(self, **kw)
        self.seen = {}

    def __call__(self, log):
        self.min_time = min(self.min_time, log.utime)
        self.max_time = max(self.max_time, log.utime)
        if log.client not in self.seen:
            self.count += 1
            self.seen[log.client] = None

    def add_byte_report(self, bytes):
        # Deliberately empty: no traffic lines for unique-client counts.
        pass


class AccumulateCriteria(Watcher):
    """
    Abstract watcher that counts how often each value returned by
    self.criteria(log) occurs and reports the counts in descending order.

    Subclasses provide criteria(log); returning None skips the record
    unless allow_empty is set.  format(value) may be overridden to
    render a value; returning None from it drops the row.
    """

    allow_empty = False
    _abstract = 'AccumulateCriteria'
    minimum = 0
    maximum = None
    exclude_media = False
    top = None
    media_extensions = ('gif', 'jpg', 'png', 'css', 'js', 'swf', 'ico')

    configurable = Watcher.configurable + [
        api.Configurable('minimum', int, 'Minimum count to display'),
        api.Configurable('maximum', int, 'Maximum count to display'),
        api.Configurable('top', api.top_or_percent, 'Top number or percent'),
        api.Configurable('exclude_media', bool, 'Don\'t match media files'),
    ]

    def __init__(self, **kw):
        Watcher.__init__(self, **kw)
        self.data = {}

    def __call__(self, log):
        if self.exclude_media:
            ext = log.url.split('.')[-1].lower()
            if ext in self.media_extensions:
                return
        value = self.criteria(log)
        if value is None and not self.allow_empty:
            return
        self.data[value] = self.data.get(value, 0) + 1

    def report(self):
        """
        Return the accumulated counts as report text, highest count
        first, honouring top, minimum and maximum.
        """
        # sorted() is stable even with reverse=True, so ties keep the
        # dictionary's iteration order just as the cmp-based sort did.
        items = sorted(self.data.items(),
                       key=lambda item: item[1], reverse=True)
        top = self.top
        if top and top < 1:
            # A fraction means "this share of all items".  Keep at least one,
            # otherwise a small share truncates to 0 and disables the limit.
            top = max(1, int(len(items) * top))
        rows = []
        max_count = 0
        for n, (value, count) in enumerate(items):
            # n is zero-based, so stop once `top` items have been considered.
            if top and n >= top:
                break
            if count < self.minimum:
                continue
            if self.maximum and count > self.maximum:
                continue
            formatted = self.format(value)
            if formatted is None:
                continue
            if not max_count:
                # First displayed row has the largest count; bars are
                # scaled relative to it.
                max_count = count
            length = 100 * count // max_count
            if length >= 10:
                # NOTE(review): the previous format string '%s%%' was given
                # two arguments and raised TypeError; this markup is a
                # reconstruction (width plus label) -- confirm.
                bar = '<div style="width: %s%%">%s%%</div>' % (length, length)
            else:
                bar = ''
            rows.append('%s%s%s\n' % (count, bar, formatted))
        if not rows:
            return '\n\nNo results found\n\n'
        return '%s\n' % ''.join(rows)

    def format(self, value):
        return value


class ReferrerCount(AccumulateCriteria):
    """Count external, non-search-engine ``http://`` referrers."""

    require = ('search_engine', 'referer')
    local_domains = []

    configurable = AccumulateCriteria.configurable + [
        api.Configurable(
            'local_domains', (list, str),
            'All domains to be considered local')]

    def criteria(self, log):
        if log.search_engine:
            return None
        referer = log.referer
        # Should simplify referrers; or have an
        # AnnotateCanonicalReferrer?
        if not referer.startswith('http://'):
            # @@: should really normalize these first anyway
            return None
        if self.local_domains:
            domain = referer[len('http://'):].split('/', 1)[0]
            if domain in self.local_domains:
                return None
        # Strip the index file name but keep the trailing slash.
        if referer.endswith('/index.html'):
            referer = referer[:-len('index.html')]
        elif referer.endswith('/index.htm'):
            referer = referer[:-len('index.htm')]
        return referer

    def format(self, value):
        if not value or value == '-':
            return None
        find_v = value.lower()
        for word in self.bad_domain_words:
            if word in find_v:
                return None
        return self.make_link(value)

    def make_link(self, value):
        return make_link(value)

    @classmethod
    def read_bad_domain_words(cls):
        """
        Load cls.bad_domain_words from bad_domain_words.txt next to this
        module, skipping blank lines and lines starting with ``#``.
        """
        cls.bad_domain_words = []
        filename = os.path.join(
            os.path.dirname(__file__), 'bad_domain_words.txt')
        with open(filename) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                cls.bad_domain_words.append(line.lower())


ReferrerCount.read_bad_domain_words()


class DomainReferrerCount(ReferrerCount):
    """Like ReferrerCount, but grouped by the referrer's domain."""

    def criteria(self, log):
        refer = ReferrerCount.criteria(self, log)
        if not refer:
            return None
        domain = refer[len('http://'):].split('/', 1)[0]
        return domain.lower()

    def make_link(self, value):
        return make_link('http://' + value)


class ClientsCount(AccumulateCriteria):
    """Count records per client, reverse-resolving names if DNS is present."""

    require = ('client',)

    def criteria(self, log):
        return log.client

    def report(self):
        if DNS:
            DNS.ParseResolvConf()
        return AccumulateCriteria.report(self)

    def format(self, value):
        """Return the reverse-DNS name for value, or value on failure."""
        if not DNS:
            return value
        try:
            return DNS.revlookup(value)
        except (IndexError, DNS.Base.DNSError):
            return value


class CountryCount(AccumulateCriteria):
    """Count records per client country code."""

    require = ('client_country_code',)

    def criteria(self, log):
        return log.client_country_code

    def format(self, value):
        try:
            return '%s (%s)' % (self.country_map[value], value)
        except KeyError:
            return value

    @classmethod
    def read_countries(cls):
        """
        Load cls.country_map (upper-cased code -> name) from
        country_codes.txt next to this module, skipping blank lines and
        lines starting with ``#``.
        """
        cls.country_map = {}
        filename = os.path.join(
            os.path.dirname(__file__), 'country_codes.txt')
        with open(filename) as f:
            for line in f:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                code, name = line.split(None, 1)
                cls.country_map[code.upper()] = name


CountryCount.read_countries()


class SearchEngineCount(AccumulateCriteria):
    """Count records per referring search engine."""

    require = ('search_engine',)

    def criteria(self, log):
        return log.search_engine

    def format(self, value):
        # NOTE(review): the previous format string was a bare '%s' fed with
        # two arguments, which raises TypeError; the anchor is reconstructed
        # from the (url, name) pair -- confirm.
        return '<a href="%s">%s</a>' % (
            _escape_html(value.url), _escape_html(value.name))


class KeywordCount(AccumulateCriteria):
    """Count search keywords of records that came from a search engine."""

    require = ('search_engine', 'search_keywords',)

    def __init__(self, **kw):
        AccumulateCriteria.__init__(self, **kw)
        # Maps log.search_keywords to the first log.search_keywords_orig
        # seen for it; format() displays the latter.
        self.seen_keywords = {}

    def criteria(self, log):
        if not log.search_engine:
            return None
        self.seen_keywords.setdefault(
            log.search_keywords, log.search_keywords_orig)
        return log.search_keywords

    def format(self, value):
        text = ' '.join(self.seen_keywords[value])
        if not text:
            return None
        # Keywords are visitor-supplied; escape them since the report
        # embeds HTML (see make_link).
        return _escape_html(text)


class EntryPageCount(AccumulateCriteria):
    """Count the pages on which sessions start (session_count == 0)."""

    require = ('session_count', 'canonical_url')

    def criteria(self, log):
        if log.session_count == 0:
            return log.canonical_url
        return None

    def format(self, value):
        return make_link(value)

# @@: ExitPageCount not yet possible (due to pipelining)


class PageHitCount(AccumulateCriteria):
    """Count records per canonical URL."""

    require = ('canonical_url',)

    def criteria(self, log):
        return log.canonical_url

    def format(self, value):
        return make_link(value)

# @@: lengthy visits


class CountHTTPErrors(AccumulateCriteria):
    """
    Count responses whose status is not in normal, optionally broken
    down per URL when include_url is set.
    """

    include_url = False
    require = ('status', 'canonical_url')

    configurable = AccumulateCriteria.configurable + [
        api.Configurable('include_url', bool,
                         'Include URL in error report')]

    normal = (200, 304, 303, 302, 301, 206)

    def criteria(self, log):
        if log.status in self.normal:
            return None
        if self.include_url:
            return (log.status, log.canonical_url)
        return log.status

    def format(self, value):
        if self.include_url:
            status, url = value
            return '%s %s %s' % (
                status,
                httpcodes.http_error_codes.get(status, ''),
                make_link(url))
        return '%s %s' % (value, httpcodes.http_error_codes.get(value, ''))


class Progress(Watcher):
    """
    Write a progress line to stderr every show_every records and
    optionally report the record count and elapsed time.
    """

    show_every = 100
    count = 0
    byte_length = None
    give_report = False
    start_time = 0
    end_time = 0

    configurable = Watcher.configurable + [
        api.Configurable('show_every', int, 'Show progress this often'),
        api.Configurable('give_report', bool,
                         'Give a report of records and time'),
    ]

    def __call__(self, log, now=time.time):
        if not self.start_time:
            self.start_time = now()
        # Must be stored on the instance: report() reads self.end_time.
        self.end_time = now()
        self.count += 1
        if self.count % self.show_every:
            return
        if self.byte_length is None:
            self.get_byte_length(log)
        current = log._fd.tell()
        if self.byte_length:
            percent = current * 100.0 / self.byte_length
        else:
            # Empty file: avoid dividing by zero.
            percent = 0
        sys.stderr.write('%3i%% %6i %3iMb/%3iMb\r'
                         % (percent, self.count,
                            current / 1048576,
                            self.byte_length / 1048576))
        sys.stderr.flush()

    def get_byte_length(self, log):
        """Store the size in bytes of the file behind log._fd."""
        fd = log._fd
        self.byte_length = os.stat(fd.name).st_size

    def report(self):
        """Return the timing summary, or None unless give_report is set."""
        if not self.give_report:
            return None
        elapsed = self.end_time - self.start_time
        return ' Total time: %s min (%0.1f hour)\n\n Records: %s ' % (
            elapsed / 60, elapsed / 3600.0, self.count)