IRToolkit / Discussion / Python: convert HTML to Text or extract plain text from structured data

#!/usr/bin/env python
"""Process a directory: recursively find all files matching a pattern in the input directory and its subdirectories, extract plain text from each file, and create an output file in the corresponding output directory."""
__version__ = "0.1"
__author__ = "Duy Dinh"
__copyright__ = "(C) 20014-2014 CRP Henri Tudor."
__contributors__ = ["Duy Dinh"]
# ------------------------------------IMPORT --------------------------------------------------------------
# import built-in libraries
import itertools as it
import sys,  time,  os,  fnmatch
import optparse # parse options and arguments

# import (attached user-defined) libraries or modules
import shutils
import html2text

# import classes from libraries
from sys import stdout
# --------------------------------------------------------------------------------------------------------------

# ----------------------------GLOBAL VARIABLES--------------------------------------------------
# program options
PROCESSED_DOCUMENTS = 0
# --------------------------------------------------------------------------------------------------------------


# Python class name
class extractPlainText: pass
# initialize options
options = extractPlainText()

# find files in a directory given a file pattern
def find_files(directory, pattern):
    for root, dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                filename = os.path.join(root, basename)
                yield filename


# process each file found
def processFile(fn,  output,  encoding): # input file name, output file name
        baseurl = None
        if fn.startswith('http://') or fn.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, text)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            data = open(fn, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                    encoding = detect(data)['encoding']
                    data = data.decode(encoding)
        text = html2text.html2text(data, baseurl)
        #wrapwrite(text)
        text = text.encode('utf-8')
        # write to output file
        if (output is not None):
            print "**** Writing plain text to file '%s'" % output
            html2text.writeToFile(text, output)
        else:
            print text

# get the sub directory name comparing outputBaseDir and inputBaseDir
def getSubDirectoryName(inputBaseDir, outputBaseDir,  inputFile): # original base directory
    subDirName = None

    # find common string between input Base Dir and output Base Dir
    commonDir = ''.join(el[0] for el in it.takewhile(lambda t: t[0] == t[1], zip(inputBaseDir, outputBaseDir))) 
    #print commonDir

    # find the difference string between input base dir and current input dir of file
    inputDir = os.path.dirname(inputFile)       
    if (commonDir <> inputDir):
        start=len(commonDir)
        end=len(inputDir)
        subDirName = inputDir[start:end]   
    # end if     
    return subDirName

# create the hierarchical folder structure for output 
def makeOutputFile(inputFile,  subDirectory,  outputDir):    
    # get file name
    filename=os.path.basename(inputFile)
    if (subDirectory is not None):
        outputDir = outputDir + subDirectory 
        if (not os.path.exists(outputDir)):
            print "Making directory '%s'" % outputDir
            os.makedirs(outputDir)

        outputFileName = outputDir + "/" + filename
    else:
        outputFileName = outputDir + "/" + filename        
    return outputFileName

# check input options
def checkOptions(inputDirectory,  outputDirectory,  encoding,  file_pattern):
    valid=1
    global PROCESSED_DOCUMENTS
    PROCESSED_DOCUMENTS = 0
    if (inputDirectory is not None and not os.path.exists(inputDirectory)):
        valid=0
        print "*** Input directory '%s' does not exist!" % inputDirectory        
    # end if     
    if (outputDirectory is not None):
        if (not os.path.exists(outputDirectory)):
            valid=0
            print "*** Output directory '%s' does not exists!" % outputDirectory
            print "Would you like to create output directory ? (y|n)"        
            line = sys.stdin.readline()
            line = line.strip()
            #print line
            if (line is "y" or line is "Y"):                
                os.makedirs(outputDirectory)               
                valid = 1
            # end if    
        else:
            # compute the number of documents that have been already processed
                for input_file in find_files(outputDirectory, file_pattern):
                    PROCESSED_DOCUMENTS = PROCESSED_DOCUMENTS + 1
                if (PROCESSED_DOCUMENTS > 0):    
                    print "There are %d documents already processed!" % PROCESSED_DOCUMENTS
                    print "Would you like to resume the process? (y|n)"
                    line = sys.stdin.readline()
                    line = line.strip()            
                    if (not (line  is "y" or line is "Y")):                    
                        print "Remove output directory '%s' ? All content from this folder will be deleted. Please confirm deletion (y|n)" % outputDirectory
                        line = sys.stdin.readline()
                        line = line.strip()
                        if (line is "y" or line is "Y"):
                            shutils.rmtree(outputDirectory)
                            os.makedirs(outputDirectory)    
                            # reset the number of processed documents
                            PROCESSED_DOCUMENTS = 0
                        #end if
                    # end if    
            # end if    
        # end if 
    # end if    
    #print valid
    return valid

def printCompletionTime(startTime,  endTime):
    time_taken = endTime - startTime
    hours, rest = divmod(time_taken,3600)
    minutes, seconds = divmod(rest, 60)
    hLabel = "hour"
    mLabel = "minute"
    sLabel = "second"
    strFormat = "%d %s"    
    stdout.write("Completion time: ")
    if (hours >= 1):
        hLabel = hLabel + "s"        
        stdout.write("%d %s" % (hours,  hLabel))
    if (minutes >= 1):
        mLabel = mLabel + "s"        
        stdout.write("%d %s" % (minutes,  mLabel)) 
    if (seconds >= 1):
        sLabel = sLabel + "s"
    stdout.write("%d %s" % (seconds,  sLabel))        
# main
if __name__ == "__main__":    
    # build options
    p = optparse.OptionParser('%prog [-i|--input] [INPUTDIRECTORY] [-o|--output OUTPUTDIRECTORY] [-e|--encoding ENCODING] [-p|pattern FILEPATTERN]', version='%prog ' + __version__)
    p.add_option("-i",  "--input",  help="Input directory")
    p.add_option("-o", "--output", help="output directory")
    p.add_option("-e",  "--encoding", default="utf8",  help="file encoding, e.g., 'ascii', 'utf8', etc.")
    p.add_option("-p",  "--pattern",  default="*.*",  help="file pattern, e.g., '*.txt', '*.xml', etc.")
    #p.add_option("-h",  "--help",  help="print this help message and exit")
    # parse options and arguments
    (options, args) = p.parse_args()

    if (len(args) >= 1):        
        # analyze options
        inputDirectory=args[0]
        if (inputDirectory is None):
            inputDirectory = options.input

        encoding = options.encoding        
        file_pattern=options.pattern        
        outputDirectory = options.output
        if (not  checkOptions(inputDirectory,  outputDirectory,  encoding,  file_pattern)):
            p.error("Please verify the options and arguments!!!")

        if (not inputDirectory.endswith("/")):
            inputDirectory = inputDirectory + "/"
        if (outputDirectory is not None):    
            if (not outputDirectory.endswith("/")):
                outputDirectory = outputDirectory + "/"
        # end analyze options

        # Recursively find all extension (e.g. *.txt, *.pdf) files in input directory    
        totalDocs = 0
        for input_file in find_files(inputDirectory, file_pattern):
            totalDocs = totalDocs + 1

        counter = 0    
        startTime = time.time()
        for input_file in find_files(inputDirectory, file_pattern):
            if (counter <= PROCESSED_DOCUMENTS):
                counter = counter + 1;
                continue

            # process each file
            print "[%d] Extracting plain text from file '%s'" % (counter,  input_file)
            # create output file
            output_file = None
            if (outputDirectory is not None):
                subDirectoryName=getSubDirectoryName(inputDirectory,  outputDirectory,  input_file)
                output_file = makeOutputFile(input_file,  subDirectoryName,  outputDirectory)

            processFile(input_file,  output_file,  encoding)

            if (counter % 1000 == 0):
                print "**** Task progress : [%.2f %%]" % ((100.0 * counter)/totalDocs)
                time.sleep(2)                
            counter = counter + 1
        # end for
        print "%d/%s files processed." % (counter,  totalDocs)
        endTime = time.time()
        printCompletionTime(startTime,  endTime)
    else:
        p.print_help()
        p.error('*** Input directory must be specified!!')
Last edit: Duy Dinh 2014-05-08
html2text.py
shutils.py
convert HTML to Text or extract plain text from structured data

IRToolkit

Forums

Help

convert HTML to Text or extract plain text from structured data

convert HTML to Text or extract plain text from structured data

IRToolkit

Forums

Help

convert HTML to Text or extract plain text from structured data document.SUBSCRIPTION_OPTIONS = { "thing": "topic", "subscribed": false, "url": "subscribe", "icon": { "css": "fa fa-envelope-o" } };

convert HTML to Text or extract plain text from structured data