--- /dev/null
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Hello, this program is written in Python - http://python.org
+programname = 'html2tsv - version 2002-09-20 - http://sebsauvage.net'
+
+import sys, getopt, os.path, glob, HTMLParser, re
+
+try: import psyco ; psyco.jit() # If present, use psyco to accelerate the program
+except: pass
+
+def usage(progname):
+ ''' Display program usage. '''
+ progname = os.path.split(progname)[1]
+ if os.path.splitext(progname)[1] in ['.py','.pyc']: progname = 'python '+progname
+ return '''%s
+A coarse HTML tables to TSV (Tab-Separated Values) converter.
+
+Syntax : %s source.html
+
+Arguments : source.html is the HTML file you want to convert to TSV.
+ By default, the file will be converted to tsv with the same
+ name and the tsv extension (source.html -> source.tsv)
+ You can use * and ?.
+
+Examples : %s mypage.html
+ : %s *.html
+
+This program is public domain.
+Author : Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+''' % (programname, progname, progname, progname)
+
+class html2tsv(HTMLParser.HTMLParser):
+ ''' A basic parser which converts HTML tables into TSV.
+ Feed HTML with feed(). Get TSV with getTSV(). (See example below.)
+ All tables in HTML will be converted to TSV (in the order they occur
+ in the HTML file).
+ You can process very large HTML files by feeding this class with chunks
+ of html while getting chunks of TSV by calling getTSV().
+ Should handle badly formated html (missing <tr>, </tr>, </td>,
+ extraneous </td>, </tr>...).
+ This parser uses HTMLParser from the HTMLParser module,
+ not HTMLParser from the htmllib module.
+ Example: parser = html2tsv()
+ parser.feed( open('mypage.html','rb').read() )
+ open('mytables.tsv','w+b').write( parser.getTSV() )
+ This class is public domain.
+ Author: Sébastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+ Versions:
+ 2002-09-19 : - First version
+ 2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
+ - now parses command-line.
+ To do:
+ - handle <PRE> tags
+ - convert html entities (&name; and &#ref;) to Ascii.
+ '''
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ self.TSV = '' # The TSV data
+ self.TSVrow = '' # The current TSV row beeing constructed from HTML
+ self.inTD = 0 # Used to track if we are inside or outside a <TD>...</TD> tag.
+ self.inTR = 0 # Used to track if we are inside or outside a <TR>...</TR> tag.
+ self.re_multiplespaces = re.compile('\s+') # regular expression used to remove spaces in excess
+ self.rowCount = 0 # TSV output line counter.
+ def handle_starttag(self, tag, attrs):
+ if tag == 'tr': self.start_tr()
+ elif tag == 'td': self.start_td()
+ def handle_endtag(self, tag):
+ if tag == 'tr': self.end_tr()
+ elif tag == 'td': self.end_td()
+ def start_tr(self):
+ if self.inTR: self.end_tr() # <TR> implies </TR>
+ self.inTR = 1
+ def end_tr(self):
+ if self.inTD: self.end_td() # </TR> implies </TD>
+ self.inTR = 0
+ if len(self.TSVrow) > 0:
+ self.TSV += self.TSVrow[:-1]
+ self.TSVrow = ''
+ self.TSV += '\n'
+ self.rowCount += 1
+ def start_td(self):
+ if not self.inTR: self.start_tr() # <TD> implies <TR>
+ self.TSVrow += ''
+ self.inTD = 1
+ def end_td(self):
+ if self.inTD:
+ self.TSVrow += '\t'
+ self.inTD = 0
+ def handle_data(self, data):
+ if self.inTD:
+ self.TSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r','').replace('"','""'))
+ def getTSV(self,purge=False):
+ ''' Get output TSV.
+ If purge is true, getTSV() will return all remaining data,
+ even if <td> or <tr> are not properly closed.
+ (You would typically call getTSV with purge=True when you do not have
+ any more HTML to feed and you suspect dirty HTML (unclosed tags). '''
+ if purge and self.inTR: self.end_tr() # This will also end_td and append last TSV row to output TSV.
+ dataout = self.TSV[:]
+ self.TSV = ''
+ return dataout
+
+
+if __name__ == "__main__":
+ try: # Put getopt in place for future usage.
+ opts, args = getopt.getopt(sys.argv[1:],None)
+ except getopt.GetoptError:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ if len(args) == 0:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ print programname
+ html_files = glob.glob(args[0])
+ for htmlfilename in html_files:
+ outputfilename = os.path.splitext(htmlfilename)[0]+'.tsv'
+ parser = html2tsv()
+ print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
+ try:
+ htmlfile = open(htmlfilename, 'rb')
+ tsvfile = open( outputfilename, 'w+b')
+ data = htmlfile.read(8192)
+ while data:
+ parser.feed( data )
+ tsvfile.write( parser.getTSV() )
+ sys.stdout.write('%d TSV rows written.\r' % parser.rowCount)
+ data = htmlfile.read(8192)
+ tsvfile.write( parser.getTSV(True) )
+ tsvfile.close()
+ htmlfile.close()
+ except:
+ print 'Error converting %s ' % htmlfilename
+ try: htmlfile.close()
+ except: pass
+ try: tsvfile.close()
+ except: pass
+ print 'All done. '