#!/usr/bin/python
from sgmllib import SGMLParser
import re
import string
class htmlCleaner( SGMLParser ):
def __init__( self ):
SGMLParser.__init__( self )
self.cleanedHTML = ''
self.cleanedText = ''
def handle_data( self, data ):
self.cleanedText += data
self.cleanedHTML += string.replace(
re.sub(
r'(http://[^\r\n \"\<]+)',
r'\1',
data,
),
"\n",
"
\n",
)
def handle_starttag( self, tag, method, attrs ):
if not method( attrs ):
self.cleanedHTML += "<" + tag + ">"
def handle_endtag( self, tag, method ):
if not method():
self.cleanedHTML += "" + tag + ">"
def start_i( self, attrs ): pass
def end_i( self ): pass
def start_b( self, attrs ): pass
def end_b( self ): pass
def start_s( self, attrs ): pass
def end_s( self ): pass
def start_tt( self, attrs ): pass
def end_tt( self ): pass
def start_a( self, attrs ):
f=1
for key, val in attrs:
if key == 'href':
self.cleanedHTML += '' % ( val, )
f=0
if f:
self.cleanedHTML += ''
return 1
def end_a( self ):
self.cleanedHTML += ''
return 1
def cleanHtml( text ):
    """Run ``text`` through a fresh ``htmlCleaner`` and return its HTML output.

    A new parser is created for every call, fed the whole string, closed,
    and its ``cleanedHTML`` attribute is returned.
    """
    cleaner = htmlCleaner()
    cleaner.feed( text )
    # close() is called before cleanedHTML is read.
    cleaner.close()
    return cleaner.cleanedHTML
if __name__ == '__main__':
    # Demo: run two sample posts through cleanHtml and print each one before
    # and after cleaning.  The second sample contains a bare http:// URL and
    # an embedded newline.
    samples = [
        """I'm writing my Radio Klogging Kit for Managers as an OPML file with a link on my site using your servlet. I have a pointer to the opml in my Instant Outline. Does the polling of my i/o cascade to xref'd outlines? """,
        """It looks like someone's subscribed to the rendered form of your outline. People should be subscribing to the raw OPML version - http://rcs.myelin.cjb.net/users/0000001/instantOutliner/rogersCadenhead.opml - but actually they're subscribing to the one that calls your servlet.
Your outline is currently the most popular file on this server, because you plus one or two others are downloading it every 10-60 seconds. I can't imagine the hammering radio.weblogs.com must be getting from all the I/O polling, but it must be pretty shocking.""",
    ]
    for sample in samples:
        # Each print takes a single parenthesised expression, so the output
        # matches the plain print statement.
        print("PARSING:")
        print(sample)
        print("--->")
        print(cleanHtml(sample))