""" Routines to get elements from lxml trees """ import re import cgi from lxml.etree import tostring from hreviewparser.parselog import dummy class ElementNotFound(IndexError): """ Raised when an element is expected, but not found """ def get_parent_with_class(el, class_name, log=dummy): """ Gets the first parent of `el` that has the given class (or raise `ElementNotFound`) """ parent = el regex = re.compile(r'\b%s\b' % class_name) while parent: class_name = parent.attrib.get('class') if class_name and regex.search(class_name): return parent parent = parent.getparent() raise ElementNotFound("No parent of %r with class_name=%r" % (el, class_name)) def get_rel_link(el, rel, log=dummy): """ Get the href and element of a single element inside `el` that is like ````, or raise `ElementNotFound` """ els = get_rel_links(el, rel, log=log) if not els: raise ElementNotFound() if len(els) > 1: log.warn( 'Multiple found: %s' % (rel, ', '.join(map(tostring, els)))) link_el = els[0] link = link_el.attrib['href'] return link, link_el def get_rel_links(el, rel, log=dummy): """ Return all links with the given ``rel={rel}``. """ return el.xpath("descendant-or-self::a[@rel='%s']" % rel) def get_single_item(el, name, log=dummy): """ Get a single value and element with the given class name. """ el = get_single_el(el, name, log=log) if el.attrib.get('title'): value = html_quote(el.attrib['title']) else: value = get_contents(el) return value, el def html_quote(s): if s is None: return '' else: return cgi.escape(unicode(s), 1) def get_contents(el): """ Return the contents of the element; either the text, or mixed text+markup for the item. """ parts = [el.text or ''] for part in el.getchildren(): parts.append(tostring(part)) return norm_whitespace(''.join(parts)) def strip_tags(text): """ Strip any tags from a piece of text """ text = re.sub(r'<.*?>', '', text) text = norm_whitespace(text.strip()) return text _whitespace_re = re.compile(r'[\s][\s]+') def norm_whitespace(text): return _whitespace_re.sub(' ', text) def get_single_el(el, class_name, log=dummy): """ Get a single element with the given class name. """ els = get_elements_by_class(el, class_name) if not els: raise ElementNotFound( "No element with class %r" % class_name) if len(els) > 1: log.warn( "Multiple elements found with class %r: %s", class_name, ', '.join([tostring(el).strip().replace('\n', '') for el in els])) return els[0] def contains_class_xpath(class_names, prefix="descendant-or-self::*"): """ Returns the xpath expression for elements that contain the class name. """ if isinstance(class_names, basestring): class_names = [class_names] expr = ' or '.join( ["contains(concat(' ', translate(@class, '\n\r\t', ' '), ' '), ' %s ')" % class_name for class_name in class_names]) return "%s[%s]" % (prefix, expr) def get_elements_by_class(node, class_names): """ Returns all elements with any of the given class names. A single class_name may also be passed in. """ return node.xpath(contains_class_xpath(class_names))