#!/usr/bin/python
"""
File: html2text.py
Copyright (C) 2008 Chris Spencer (chrisspen at gmail dot com)
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""
import os, sys, htmllib, formatter, StringIO, re, urllib, HTMLParser, time
import htmlentitydefs
# The TidyLib wrapper is a hard requirement: tidyHTML() calls tidy.parseString().
try:
    import tidy
except ImportError:
    # Tell the user what is missing, then let the original error propagate.
    print("You need to install the Python wrapper for TidyLib.")
    raise
def unescapeHTMLEntities(text):
    """Replace HTML/XML character references and named entities in *text*.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;`` or ``&#X41;``) and named entities listed in
    ``htmlentitydefs.name2codepoint`` (``&amp;``, ``&eacute;``, ...).
    Anything that cannot be resolved is left in the text unchanged.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    Args:
        text: string that may contain references or entities.

    Returns:
        The string with every resolvable reference replaced by the
        character it denotes.
    """
    def fixup(m):
        token = m.group(0)
        if token[:2] == "&#":
            # Numeric character reference.
            try:
                if token[:3].lower() == "&#x":
                    return unichr(int(token[3:-1], 16))
                return unichr(int(token[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a code point that cannot be represented.
                pass
        else:
            # Named entity.
            try:
                return unichr(htmlentitydefs.name2codepoint[token[1:-1]])
            except KeyError:
                pass
        return token  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
class TextExtractor(HTMLParser.HTMLParser):
    """
    Attempts to extract the main body of text from an HTML document.

    This is a messy task, and certain assumptions about the story text
    must be made. The story text:
        1. Is the largest block of text in the document.
        2. Sections all exist at the same relative depth.

    How it works: while parsing, ``self.path`` holds one sibling index per
    open element, from the root down to the current element. Every piece
    of text is stored in ``self.depthText`` under its path with the last
    ``pathBlur`` components removed, so text found below a common ancestor
    is collected in one bucket. ``get_plaintext()`` returns the longest
    bucket.

    NOTE: '#' is used internally to mark text taken from one layer deeper,
    and every '#' is removed from the final text, including any that were
    part of the document itself.
    """
    # Class-level defaults, kept so the attribute names stay available on
    # the class. Each instance gets its own lists in __init__, because the
    # parser mutates ``path`` in place and must not share it.
    dom = []  # not used by the code in this class
    path = [0]
    pathBlur = 5

    # Tags whose content is never treated as story text.
    _ignoredTags = ('script', 'style', 'option', 'ul', 'li', 'legend',
                    'object', 'noscript', 'label')

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.dom = []
        self.path = [0]
        self._ignore = False      # True while inside an ignored subtree
        self._ignorePath = None   # path of the element that started it
        self._lasttag = None
        self._depth = 0
        self.depthText = {}       # blurred path (tuple) -> list of text
        self.counting = 0
        self.lastN = 0            # sibling index for the next child

    def handle_starttag(self, tag, attrs):
        """Descend into an element and decide whether to ignore it."""
        wasIgnoring = self._ignore
        tag = tag.lower()
        if tag in self._ignoredTags:
            self._ignore = True
        attrd = dict(attrs)
        self._lasttag = tag
        self._depth += 1
        self.path.append(self.lastN)
        self.lastN = 0

        # Ignore footer garbage. Attribute values can be None (an
        # attribute written without a value), hence the ``or ''``.
        elemId = (attrd.get('id') or '').lower()
        elemClass = (attrd.get('class') or '').lower()
        if 'footer' in elemId or 'copyright' in elemId:
            self._ignore = True
        elif 'footer' in elemClass:
            self.counting = max(self.counting, 1)
            self._ignore = True
        elif 'copyright' in elemClass:
            self._ignore = True

        # If we just started ignoring, remember the path of this element
        # so handle_endtag() knows when to stop ignoring.
        if self._ignore and not wasIgnoring:
            self._ignorePath = tuple(self.path)

    def handle_startendtag(self, tag, attrs):
        """Self-closing tags carry no text and do not affect the path."""
        pass

    def handle_endtag(self, tag):
        """Leave the current element and advance the sibling counter."""
        if self._ignore and tuple(self.path) == self._ignorePath:
            self._ignore = False
        # A stray end tag at the root must not pop the root entry;
        # otherwise a second stray tag would pop from an empty list.
        if len(self.path) > 1:
            self._depth -= 1
            self.lastN = self.path.pop() + 1

    def handle_data(self, data, entity=False):
        """Store a piece of text under its blurred path.

        ``entity`` is accepted for the calls made by handle_charref() and
        handle_entityref() but is not used.
        """
        if not data or self._ignore:
            return
        # Skip blocks of text beginning with 'copyright', which usually
        # indicates a copyright notice. Ignoring lasts until the current
        # element is closed.
        if data.strip().lower().startswith('copyright'):
            self._ignore = True
            self._ignorePath = tuple(self.path)
            return

        rpath = tuple(self.path[:-self.pathBlur])
        self.depthText.setdefault(rpath, []).append(data)

        # Also file the text one level further up, so a bucket includes
        # text nested one layer deeper (presumably inline markup; the tag
        # names in the original comment were lost). This drags in header
        # and footer text as well, so these copies are prefixed with '#'
        # and the leading/trailing ones are stripped in get_plaintext().
        rpath2 = tuple(self.path[:-self.pathBlur - 1])
        # For shallow paths both slices collapse to the same key; adding
        # the marked copy there would only duplicate the text.
        if rpath2 != rpath:
            self.depthText.setdefault(rpath2, []).append('#' + data)

    def handle_charref(self, name):
        """Decode a numeric reference; *name* is e.g. '65' or 'x41'."""
        text = unescapeHTMLEntities('&#' + name + ';')
        self.handle_data(text, entity=True)

    def handle_entityref(self, name):
        """Decode a named entity; *name* is e.g. 'amp'."""
        text = unescapeHTMLEntities('&' + name + ';')
        self.handle_data(text, entity=True)

    @staticmethod
    def _dropLeadingMarked(segments):
        """Return *segments* without the '#'-marked ones at the front.

        Whitespace-only segments are kept and do not end the leading run.
        """
        kept = []
        leading = True
        for seg in segments:
            if seg.strip():
                if leading and seg.startswith('#'):
                    continue
                leading = False
            kept.append(seg)
        return kept

    def get_plaintext(self):
        """Return the longest collected text block, whitespace-compressed.

        Returns an empty string when no text was collected. When two
        blocks have the same length, the one with the greater path wins.
        """
        bestKey, bestText = None, ''
        for path, segments in self.depthText.items():
            # Strip off header segments, prefixed with a '#'.
            segments = self._dropLeadingMarked(segments)
            # Strip off footer segments the same way, from the other end.
            segments = self._dropLeadingMarked(reversed(segments))
            segments.reverse()

            text = ''.join(segments).replace('#', '')
            text = text.replace(u'\xa0', ' ')
            text = text.replace(u'\u2019', "'")
            text = re.sub(r"\s+", " ", text).strip()  # Compress whitespace.

            key = (len(text), path)
            if bestKey is None or key > bestKey:
                bestKey, bestText = key, text
        return bestText

    def error(self, msg):
        # ignore all errors
        pass
class HTMLParserNoFootNote(htmllib.HTMLParser):
    """
    Ignores link footnotes, image tags, and other useless things.

    The tag hooks below only track the position in the document
    (``self.path``, one counter per open element) and never call the
    tag-specific handler they are given.

    NOTE(review): sgmllib calls handle_starttag(tag, method, attrs), so
    the parameter named ``attrs`` here presumably receives the handler
    method -- confirm before relying on it.
    """
    # Optional compiled regex; when set, only its matches are kept as text.
    textPattern = None
    # Class-level default; every instance gets its own list in __init__.
    path = [0]

    def __init__(self, *args, **kwargs):
        htmllib.HTMLParser.__init__(self, *args, **kwargs)
        # ``path`` is mutated in place, so it must not be shared.
        self.path = [0]

    def handle_starttag(self, tag, attrs, *args):
        # Descend one level; the new element has seen no children yet.
        self.path.append(0)

    def handle_endtag(self, tag, *args):
        # Leave the element and count it as a child of its parent. The
        # root entry is never popped, so stray end tags cannot empty it.
        if len(self.path) > 1:
            self.path.pop()
            self.path[-1] += 1

    def anchor_end(self):
        # Close the anchor without emitting a "[n]" footnote marker.
        if self.anchor:
            self.anchor = None

    def handle_image(self, src, alt, *args):
        # Images produce no output.
        pass

    def handle_data(self, data):
        if self.textPattern:
            data = ' '.join(self.textPattern.findall(data))
        htmllib.HTMLParser.handle_data(self, data)
def extractFromHTML(html):
    """
    Extracts text from HTML content.

    Feeds *html* to a TextExtractor, takes its plain text and cleans it
    up: stand-alone punctuation, runs of dashes and runs of periods are
    removed, and whitespace is compressed to single spaces.

    Args:
        html: the HTML document as a string.

    Returns:
        The cleaned-up text.
    """
    parser = TextExtractor()
    parser.feed(html)
    parser.close()
    text = parser.get_plaintext()
    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()
    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()
    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()
    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()
    return text
def tidyHTML(dirtyHTML):
    """
    Runs an arbitrary HTML string through Tidy.

    The document is parsed with the output_xhtml, add_xml_decl, indent
    and tidy_mark options enabled, written to an in-memory buffer, and
    the buffer's contents are returned.
    """
    tidyOptions = {
        'output_xhtml': 1,
        'add_xml_decl': 1,
        'indent': 1,
        'tidy_mark': 1,
    }
    buf = StringIO.StringIO()
    document = tidy.parseString(dirtyHTML, **tidyOptions)
    document.write(buf)
    return buf.getvalue()
def extractFromURL(url, cache=0, cacheDir='_cache'):
    """
    Extracts text from a URL.

    Downloads the page, converts it to XHTML with tidyHTML() and extracts
    the text with extractFromHTML().

    Args:
        url: address of the page to fetch.
        cache: when true, results are stored in and served from cacheDir.
        cacheDir: directory for cached results; created if missing.

    Returns:
        The extracted text. A cached result is decoded from UTF-8, the
        encoding it was written with.

    NOTE: the cache file name is the URL with every character other than
    letters, digits, '-' and '_' removed, so different URLs can map to
    the same file.
    """
    fn = None
    if cache:
        if not os.path.isdir(cacheDir):
            os.makedirs(cacheDir)
        fn = os.path.join(cacheDir, re.sub(r'[^a-zA-Z0-9\-_]+', '', url) + '.txt')
        if os.path.isfile(fn):
            fin = open(fn, 'rb')
            try:
                return fin.read().decode('utf-8')
            finally:
                fin.close()

    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    # Convert content to XHTML.
    html = tidyHTML(html)
    res = extractFromHTML(html)

    if cache:
        fout = open(fn, 'wb')
        try:
            fout.write(res.encode('utf-8'))
        finally:
            # Closing guarantees the cached text is flushed to disk.
            fout.close()
    return res
if __name__ == '__main__':
    # Command-line entry point: print the text extracted from the URL
    # given as the first argument, or a usage line when it is missing.
    if len(sys.argv) < 2:
        print("Usage: %s <url>" % __file__)
    else:
        print(extractFromURL(sys.argv[1]))