# -*- coding: utf-8 -*-
"""Helpers for fetching a web page and cleaning up HTML text.

Provides a page fetcher (``get_page``), a tag stripper (``no_tags``) and an
HTML entity decoder (``unquote_html`` / ``convert_entity``).
"""
import re
import sys
import urllib

try:
    import urllib2
    import htmlentitydefs
    _urlencode = urllib.urlencode
except ImportError:
    # The modules were renamed in Python 3; bind them under the old names so
    # the helpers below can be imported on either interpreter.
    import urllib.request as urllib2
    import urllib.parse
    import html.entities as htmlentitydefs
    _urlencode = urllib.parse.urlencode


def get_page(url, values=None):
    """Fetch ``url`` and return the raw response body.

    ``values`` is an optional mapping (or sequence of pairs) that is
    URL-encoded and appended to ``url`` as a query string.  A ``?`` or ``&``
    separator is inserted only when ``url`` does not already end with one,
    so a caller that passes ``'http://host/search?'`` gets the same URL as
    before.

    The request is sent with a browser-like User-Agent header.  Errors
    raised by ``urlopen`` propagate to the caller.
    """
    query = _urlencode(values) if values else ''
    if query and not url.endswith(('?', '&')):
        url += '&' if '?' in url else '?'
    req = urllib2.Request(url + query)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Python 2 responses are not context managers, so close explicitly.
        response.close()


def no_tags(s):
    """Return ``s`` with every ``<...>`` tag removed."""
    return re.sub(r"<[^>]+>", "", s)


def convert_entity(m):
    """Translate one entity match produced by ``unquote_html``.

    ``m`` is a match object whose group 1 is ``'#'`` for a numeric character
    reference (empty otherwise) and whose group 2 is the text between the
    ``&``/``&#`` prefix and the closing ``;``.

    Numeric references may be decimal (``&#65;``) or hexadecimal
    (``&#x41;``).  Named entities are looked up in
    ``htmlentitydefs.entitydefs``.  Anything that cannot be decoded is
    returned exactly as it appeared in the input.
    """
    body = m.group(2)
    if m.group(1) == '#':
        try:
            if body[:1] in ('x', 'X'):
                code = int(body[1:], 16)
            else:
                code = int(body)
            return chr(code)
        except (ValueError, OverflowError):
            # Not a number, or outside the range chr() accepts: keep the
            # original text (including the '&#' prefix) instead of mangling it.
            return m.group(0)
    try:
        return htmlentitydefs.entitydefs[body]
    except KeyError:
        return m.group(0)


def unquote_html(s):
    """Return ``s`` with HTML entities replaced via ``convert_entity``."""
    return re.sub(r'&(#?)(.+?);', convert_entity, s)


# NOTE(review): the script entry point is cut off in the text available for
# review, so it is kept verbatim below (still commented out, as it was in the
# original single-line form) rather than reconstructed.
# if __name__ == '__main__': # Get a random subject from Wikipedia try: wiki_page = get_page('http://en.wikipedia.org/wiki/Special:Random') except: print 'Offline?' sys.exit() try: wiki_title = re.findall('\