import re
import urllib

import annotate

# Registry of every known search engine, in registration order.  The
# order is significant: referers are tested against the engines'
# patterns in this order and the first pattern that matches wins.
all_engines = []


class Engine(object):

    """
    A container for engine information.

    Instances created through ``engine()`` and ``quick()`` are appended
    to the global variable ``all_engines``.

    Attributes:

    ``name``
        Human-readable name of the search engine.
    ``url``
        Home page of the search engine.
    ``regexes``
        List of compiled patterns that recognise a referer from this
        engine.  Consumers read the search query from group 1, so every
        pattern must define that group (it may be empty).
    """

    def __init__(self, name, regexes, url):
        """
        ``regexes`` may be a single pattern or a list/tuple of patterns;
        each one may be a string (compiled here) or an already-compiled
        pattern object (stored untouched).
        """
        self.name = name
        self.url = url
        self.regexes = []
        if not isinstance(regexes, (list, tuple)):
            # Allow a lone pattern to be passed without wrapping it.
            regexes = [regexes]
        for regex in regexes:
            if isinstance(regex, (str, unicode)):
                regex = re.compile(regex)
            self.regexes.append(regex)


def engine(*args, **kw):
    """
    Make an engine: build an ``Engine`` from the given arguments (see
    ``Engine.__init__``) and register it in ``all_engines``.
    """
    all_engines.append(Engine(*args, **kw))


def quick(name, url):
    """
    Make an engine using an abbreviated definition, given a name and a
    URL.

    ``url`` has the form ``[http://]host/path[?param]``:

    * with ``?param``, the search query is the value of the request
      parameter called ``param`` on any URL starting with
      ``http://host/path``;
    * without it, the search query is whatever follows
      ``http://host/path`` in the referer.

    The engine's home page is taken to be ``http://host``.
    """
    if not url.startswith('http://'):
        url = 'http://' + url
    pieces = url.split('?')
    base = pieces[0]
    # 'http://host/path'.split('/') is ['http:', '', 'host', 'path'], so
    # the first three items rebuild the scheme and host.  (Taking only
    # the first item, as this function used to, always gave 'http:'.)
    real_url = '/'.join(base.split('/')[:3])
    search_url = '^' + base
    if len(pieces) > 1:
        param_name = pieces[1]
        # Anchor the name on a parameter boundary.  Without it the greedy
        # '.*' latches on to the last parameter whose name merely ends
        # with ``param_name`` (e.g. 'oq=' or 'aq=' when looking for 'q=').
        search_url = search_url + '.*[?&]' + param_name + '=([^&]*)'
    else:
        search_url = search_url + '(.*)'
    all_engines.append(Engine(name, [search_url], real_url))


# Google's host part is a regular expression (any country domain), so no
# home page can be derived from it.  It is therefore registered with an
# explicit pattern and URL instead of through quick().
engine('Google',
       [r'^http://www.google.[a-z.]+/.*[?&]q=([^&]*)'],
       'http://www.google.com')
quick('Google Images', 'images.google.com/images?q')
engine('Google Image View',
       [r'images.google.[a-z.]+/images.*&q=([^&]*)',
        r'(?:images|www).google.[a-z.]+/imgres.*%3Fq%3D(.*?)%26',
        r'(?:images|www).google.[a-z.]+/imgres.*()'],
       'http://images.google.com')
engine('Yahoo',
       [r'search.yahoo.com/(?:bin/)?search.*[?&]p=([^&]*)'],
       'http://search.yahoo.com')
engine('Yahoo Images',
       [r'[a-z.]*search.yahoo.[a-z]+/(?:search/)?images/view.*%26(?:va|p)%3D(.*?)%26',
        r'[a-z.]*search.yahoo.[a-z]+/(?:search/)?images/view.*&p=(.*?)&',
        r'kr.imagesearch.yahoo.com/imgsrch/av/av_detail()'],
       'http://images.search.yahoo.com')
engine('Yahoo RD',
       [r'rds.yahoo.com.*/K=([^/]*)'],
       'http://rds.yahoo.com')
quick('Metacrawler', 'http://www.metacrawler.com/info.metac/search/web/')
quick('Dogpile', 'http://www.dogpile.com/info.dogpl/search/web/')
quick('Altavista', 'http://www.altavista.com/web/results?q')
quick('Altavista Images', 'http://www.altavista.com/image/results?q')
quick('Excite', 'http://msxml.excite.com/info.xcite/search/web/')
quick('Freshmeat', 'http://freshmeat.net/search?q')
quick('Snap', 'http://www.snap.com/search.php?query')
engine('AOL Search',
       [r'aolsearch.aol.com/aol/search.*[?&]query=([^&]*)',
        r'sucheaol.aol.de/suche/bilder/search.*[?&]q=([^&]*)'],
       'http://aolsearch.aol.com')
engine('AOL Images',
       [r'aolsearch.aol.com/aol/imageDetails.*[?&]query=([^&]*)',
        r'aolimages.aol.fr/image.*[?&]query=([^&]*)',
        r'aolsearch.aol.co.uk/image_browse.*[?&]query=([^&]*)'],
       'http://aolsearch.aol.com/aol/imagehome')
engine('MSN',
       [r'http://search.msn.com/.*results.asp.*[?&]q=([^&]*)',
        r'http://sea.search.msn.com/(?:preview|spresults).asp.*[?&]q=([^&]*)',
        r'http://search.sympatico.msn.ca/results.asp.*[?&]q=([^&]*)',
        r'http://search.ninemsn.com.au/results.aspx.*[?&]q=([^&]*)'],
       'http://search.msn.com')
quick('Mamma', 'http://mamma.com/Mamma?query')
quick('Voila', 'http://search.ke.voila.fr/S/voila?kw')
quick('Compuserve', 'http://websearch.cs.com/cs/search?query')
quick('Acoon', 'http://www.acoon.de/cgi-bin/search.exe?begriff')
quick('Ask Jeeves', 'http://web.ask.com/web?q')
engine('MyWay',
       [r'mysearch.myway.com/jsp/GGmain.jsp.*[?&]searchfor=([^&]*)'],
       'http://mysearch.myway.com')
quick('Search.com', 'http://www.search.com/search?q')
quick('Lycos', 'http://search.lycos.com/default.asp?query')
quick('AllTheWeb', 'http://www.alltheweb.com/search?q')
quick('Netscape.com', 'http://search.netscape.com/ns/?query')
quick('ICQsearch', 'http://google.icq.com/search/results.php?q')
quick('Searchalot', 'http://www.searchalot.com/texis/open/search?q')
quick('Go Hip!', 'http://search.gohip.com/nph-general_search.cgi?sc')
quick('Sensis.com.au', 'http://www.sensis.com.au/search.do?find')
quick('Profusion', 'http://www.profusion.com/results?queryterm')
quick('WebTV', 'http://search.bay.webtv.net/?q')
quick('Teoma', 'http://s.teoma.com/search?q')
quick('Earthlink', 'http://search.earthlink.net/search?q')
quick('HotBot', 'http://www.hotbot.com/?query')
engine('Vivisimo',
       [r'http://[a-z.]*vivisimo.com/search.*[&?]query=([^&]*)'],
       'http://vivisimo.com')
quick('iWon', 'http://search.iwon.com/?searchfor')
quick('Seeker Bar', 'http://www.websearch.com/?qkw')
quick('Delv UK', 'http://www.delv.co.uk/results.asp?qry')
quick('Marsfind', 'http://www.marsfind.com/search.html?Keywords')
quick('Viewpoint', 'http://search.viewpoint.com/?k')
quick('TheSearchEngine', 'http://www.thesearchengine.org/?src')

# Flat list of (compiled pattern, Engine) pairs in registration order.
# It is built once, here, so engines registered after this point are not
# included.
# The iteration names deliberately avoid ``engine``: at module level a
# loop variable with that name would rebind the ``engine()`` registration
# function to an Engine instance.
all_regexes = [(pattern, search_engine)
               for search_engine in all_engines
               for pattern in search_engine.regexes]


class AnnotateSearchEngine(annotate.Annotate):

    """
    Adds an attribute ``search_engine``, which will be the matching
    ``Engine`` instance if the request was referred by a search engine
    (or None otherwise).  If a search engine matched, also adds
    ``search_query``, the query string exactly as captured from the
    referer (i.e. still url-quoted).
    """

    require = ('referer',)
    provides = ('search_engine', 'search_query',)

    def __call__(self, log):
        """
        Annotate ``log`` in place and return it.

        The referer is tested against ``all_regexes`` in order; the
        first pattern that matches decides the engine, and its group 1
        is the query.  ``search_query`` is left untouched when nothing
        matches.
        """
        # A missing referer (None or empty) cannot match any pattern;
        # normalise it so that ``search`` is always handed a string.
        referer = log.referer or ''
        for regex, matched_engine in all_regexes:
            match = regex.search(referer)
            if match:
                log.search_engine = matched_engine
                log.search_query = match.group(1)
                return log
        log.search_engine = None
        return log


class AnnotateKeywords(annotate.Annotate):

    """
    Adds ``search_keywords``, which is an alphabetically sorted tuple of
    the words in the ``search_query`` (if any query exists, of course).
    Also adds ``search_keywords_orig``, a list of the same words in the
    order they appeared in the query.

    Words are lower-cased, and every character other than ASCII letters,
    digits, ``:`` and space is dropped before splitting.
    """

    require = ('search_engine', 'search_query')
    provides = ('search_keywords',)

    _nonletter_re = re.compile('[^a-zA-Z0-9: ]')

    def __call__(self, log):
        """
        Annotate ``log`` in place and return it.  Nothing is added when
        ``log.search_engine`` is false.
        """
        if not log.search_engine:
            return log
        query = urllib.unquote(log.search_query)
        query = query.replace('+', ' ')
        if '%' in query:
            # @@ Some queries are double-quoted
            query = urllib.unquote(query)
        query = self._nonletter_re.sub('', query).lower()
        # split() without arguments never yields empty strings, so no
        # further filtering is needed.
        words_orig = query.split()
        log.search_keywords = tuple(sorted(words_orig))
        log.search_keywords_orig = words_orig
        return log