#!/usr/bin/python
"""
File: html2text.py

Copyright (C) 2008  Chris Spencer (chrisspen at gmail dot com)

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
"""
from __future__ import with_statement

import contextlib
import formatter
import htmlentitydefs
import htmllib
import HTMLParser
import os
import re
import StringIO
import sys
import time
import urllib

try:
    import tidy
except ImportError:
    print("You need to install the Python wrapper for TidyLib.")
    raise

# Matches the argument HTMLParser passes to handle_charref() for a numeric
# character reference: decimal digits, or 'x'/'X' followed by hex digits.
_NUMERIC_CHARREF = re.compile(r'^(?:[0-9]+|[xX][0-9a-fA-F]+)$')


def unescapeHTMLEntities(text):
    """
    Replaces HTML/XML numeric character references (&#NNN;, &#xHHH;) and
    named entities (&name;) in `text` with the corresponding unicode
    characters.

    References that cannot be decoded (unknown entity name, malformed or
    out-of-range number) are left in the text unchanged.

    Based on code from Fredrik Lundh:
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                # Accept both '&#x..;' and '&#X..;' as hexadecimal.
                if ref[:3].lower() == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                pass
        else:
            # Named entity.
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # Leave as is.
    return re.sub(r"&#?\w+;", fixup, text)


def _dropLeadingMarked(segments):
    """
    Returns a copy of `segments` without the non-blank segments that start
    with the '#' marker and occur before the first unmarked non-blank segment.
    Whitespace-only segments are always kept.
    """
    kept = []
    leading = True
    for segment in segments:
        if segment.strip():
            if leading and segment.startswith('#'):
                continue
            leading = False
        kept.append(segment)
    return kept


class TextExtractor(HTMLParser.HTMLParser):
    """
    Attempts to extract the main body of text from an HTML document.

    This is a messy task, and certain assumptions about the story text
    must be made:

    The story text:
    1. Is the largest block of text in the document.
    2. Sections all exist at the same relative depth.

    Usage: feed() the document, close() the parser, then call
    get_plaintext().
    """

    dom = []
    # Class-level default kept for backwards compatibility; every instance
    # gets its own list in __init__ so state is not shared between parsers.
    path = [0]
    # Number of trailing path components dropped when grouping text, so that
    # text from slightly different depths lands in the same group.
    pathBlur = 5

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._ignore = False
        self._ignorePath = None
        self._lasttag = None
        self._depth = 0
        self.depthText = {}  # path:text
        self.counting = 0
        self.lastN = 0
        # Position of the current element in the tree: one sibling index
        # per nesting level.
        self.path = [0]

    def handle_starttag(self, tag, attrs):
        """
        Descends one level in the element path and starts ignoring text when
        the tag, or its id/class attribute, marks non-story content.
        """
        wasIgnoring = self._ignore
        tag = tag.lower()
        if tag in ('script', 'style', 'option', 'ul', 'li', 'legend',
                   'object', 'noscript', 'label'):
            self._ignore = True
        attrd = dict(attrs)
        self._lasttag = tag
        self._depth += 1
        self.path.append(self.lastN)
        self.lastN = 0

        # Ignore footer garbage.
        # Attributes written without a value are reported as None.
        elementId = (attrd.get('id') or '').lower()
        elementClass = (attrd.get('class') or '').lower()
        if 'footer' in elementId or 'copyright' in elementId:
            self._ignore = True
        elif 'footer' in elementClass:
            self.counting = max(self.counting, 1)
            self._ignore = True
        elif 'copyright' in elementClass:
            self._ignore = True

        # If we just started ignoring, then remember the initial path
        # so we can later know when to start un-ignoring again.
        if self._ignore and not wasIgnoring:
            self._ignorePath = tuple(self.path)

    def handle_startendtag(self, tag, attrs):
        """Self-closing tags carry no text and do not change the path."""
        pass

    def handle_endtag(self, tag):
        """
        Ascends one level in the element path and stops ignoring text once
        the element that started the ignored region is closed.
        """
        if self._ignore and tuple(self.path) == self._ignorePath:
            self._ignore = False
        if len(self.path) <= 1:
            # Stray closing tag with no matching opening tag: never pop the
            # root entry, otherwise a later pop would raise IndexError.
            return
        self._depth -= 1
        self.lastN = self.path.pop() + 1

    def handle_data(self, data, entity=False):
        """
        Records `data` under its blurred element path unless text is
        currently being ignored.

        `entity` is accepted for callers that flag decoded entity text; it
        is not used.
        """
        if not data or self._ignore:
            return

        # Skip blocks of text beginning with 'copyright', which usually
        # indicates a copyright notice.
        if data.strip().lower().startswith('copyright'):
            self._ignore = True
            self._ignorePath = tuple(self.path)
            return

        rpath = tuple(self.path[:-self.pathBlur])
        self.depthText.setdefault(rpath, []).append(data)

        # Also record the text one level further up, so that text nested
        # one level deeper (e.g. inside inline formatting tags) is included
        # with its surroundings.
        # Unfortunately, this will include a lot of crap from the page's
        # header and footer, so this copy is prefixed with '#' and the
        # leading/trailing marked segments are stripped out later.
        rpath2 = tuple(self.path[:-self.pathBlur - 1])
        self.depthText.setdefault(rpath2, []).append('#' + data)

    def handle_charref(self, name):
        """
        Decodes a numeric character reference and records it as text.

        `name` is the reference without '&#' and ';', e.g. '62' or 'x3E'.
        Any other value is treated as a named entity, as before.
        """
        if _NUMERIC_CHARREF.match(name):
            reference = '&#' + name + ';'
        else:
            reference = '&' + name + ';'
        self.handle_data(unescapeHTMLEntities(reference), entity=True)

    def handle_entityref(self, name):
        """Decodes a named entity reference and records it as text."""
        self.handle_data(unescapeHTMLEntities('&' + name + ';'), entity=True)

    def get_plaintext(self):
        """
        Returns the longest block of text collected for any blurred path,
        with whitespace compressed. Returns '' if no text was collected.
        """
        bestKey, bestText = None, ''
        for path, segments in self.depthText.items():
            # Strip off header segments, prefixed with a '#'.
            kept = _dropLeadingMarked(segments)
            # Strip off footer segments, prefixed with a '#'.
            kept = _dropLeadingMarked(kept[::-1])[::-1]

            # Remove only the marker prefix, so '#' characters that are
            # part of the text itself survive.
            text = ''.join([
                segment[1:] if segment.startswith('#') else segment
                for segment in kept
            ])
            text = text.replace(u'\xa0', ' ')
            text = text.replace(u'\u2019', "'")
            text = re.sub(r"\s+", " ", text).strip()  # Compress whitespace.

            # Longest text wins; equal lengths are decided by the path so
            # the result does not depend on dictionary ordering.
            key = (len(text), path)
            if bestKey is None or key > bestKey:
                bestKey, bestText = key, text
        return bestText

    def error(self, msg):
        # Ignore all parse errors; extraction is best-effort.
        pass


class HTMLParserNoFootNote(htmllib.HTMLParser):
    """
    Ignores link footnotes, image tags, and other useless things.

    Set `textPattern` to a compiled regular expression to keep only the
    matching parts of each text chunk.
    """

    textPattern = None
    # Class-level default kept for backwards compatibility; every instance
    # gets its own list in __init__.
    path = [0]

    def __init__(self, *args, **kwargs):
        htmllib.HTMLParser.__init__(self, *args, **kwargs)
        self.path = [0]

    def handle_starttag(self, tag, attrs, *args):
        # Only tracks nesting; the tag itself is not processed.
        self.path.append(0)

    def handle_endtag(self, tag, *args):
        # Never pop the root entry, so unbalanced closing tags cannot
        # empty the path.
        if len(self.path) > 1:
            self.path.pop()
            self.path[-1] += 1

    def anchor_end(self):
        # Close the anchor without emitting a "[n]" footnote marker.
        if self.anchor:
            self.anchor = None

    def handle_image(self, src, alt, *args):
        # Images produce no text.
        pass

    def handle_data(self, data):
        if self.textPattern:
            data = ' '.join(self.textPattern.findall(data))
        htmllib.HTMLParser.handle_data(self, data)


def extractFromHTML(html):
    """
    Extracts text from HTML content.

    Returns the main body of text found by TextExtractor with stand-alone
    punctuation, runs of dashes and runs of periods removed and whitespace
    compressed.
    """
    parser = TextExtractor()
    parser.feed(html)
    parser.close()
    text = parser.get_plaintext()
    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()
    # Compress whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    # Remove consecutive dashes.
    text = re.sub(r"-{2,}", "", text).strip()
    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()
    return text


def tidyHTML(dirtyHTML):
    """
    Runs an arbitrary HTML string through Tidy and returns the result.
    """
    buf = StringIO.StringIO()
    options = dict(output_xhtml=1,
                   add_xml_decl=1,
                   indent=1,
                   tidy_mark=1)
    document = tidy.parseString(dirtyHTML, **options)
    document.write(buf)
    return buf.getvalue()


def extractFromURL(url, cache=0, cacheDir='_cache'):
    """
    Extracts text from a URL.

    If `cache` is true, results are stored as UTF-8 text files in
    `cacheDir` (created if missing) and reused on later calls for the same
    URL. Cached results are decoded from UTF-8, so they are returned as
    unicode text.
    """
    fn = None
    if cache:
        if not os.path.isdir(cacheDir):
            os.makedirs(cacheDir)
        fn = os.path.join(
            cacheDir, re.sub(r'[^a-zA-Z0-9\-_]+', '', url) + '.txt')
        if os.path.isfile(fn):
            with open(fn, 'rb') as fin:
                return fin.read().decode('utf-8')

    with contextlib.closing(urllib.urlopen(url)) as response:
        html = response.read()

    # Convert content to XHTML.
    html = tidyHTML(html)

    res = extractFromHTML(html)
    if cache:
        with open(fn, 'wb') as fout:
            fout.write(res.encode('utf-8'))
    return res


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: %s <url>" % __file__)
    else:
        print(extractFromURL(sys.argv[1]))