I'm running a 17 MB HTML document through HTMLParser. I use Lexer.nextNode() in a loop to iterate over the entire document, writing it verbatim to an OutputStream except when I recognize a particular sequence of tags, which I then write out in a custom manner.
If I leave my maximum heap size at 64 MB, the program gives up with an OutOfMemoryError before finishing the transformation process. I can't account for this -- is there any good reason the Lexer should accumulate heap space as it progresses through the document?
Source code follows:
-----
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import java.io.*;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ReportOutputFilter {
    private static final Pattern HrefPattern = Pattern.compile("row_number");

    public static void filterReportStream(InputStream in, OutputStream out)
        throws IOException
    {
        int success = 0;
        long rowCount = 0;
        long depth = -1;
        Page page = new Page(in, "UTF-8");
        Lexer lexer = new Lexer(page);
        Node node;
        Tag tag;
        Tag trTag = null, tdTag = null, aTag = null, imgTag = null;
        String tagName;
        Date startDate, endDate;
        long elapsed;
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
        try {
            startDate = new Date();
            while (null != (node = lexer.nextNode())) {
                if (node instanceof Tag) {
                    tag = (Tag) node;
                    tagName = tag.getTagName();
                    if (tag.isEndTag()) {
                        writeNode(writer, depth, node, false);
                        depth--;
                        continue;
                    }
                    depth++;
                    if ("TR".equals(tagName)) {
                        trTag = tag;
                        success = 1;
                    }
                    else if ("TD".equals(tagName) && success == 1) {
                        tdTag = tag;
                        success = 2;
                    }
                    else if ("A".equals(tagName) && success == 2) {
                        aTag = tag;
                        success = 3;
                    }
                    else if ("IMG".equals(tagName) && success == 3) {
                        imgTag = tag;
                        // Full success, row tags found
                        // Write the whole bunch of opener tags to the output
                        writeReportRowOpeners(writer, depth, rowCount, trTag, tdTag, aTag, imgTag);
                        rowCount++;
                        trTag = tdTag = aTag = imgTag = null;
                        success = 0;
                    }
                    else {
                        trTag = tdTag = aTag = imgTag = null;
                        success = 0;
                        writeNode(writer, depth, node, false);
                    }
                    if (tag.isEmptyXmlTag() || tagName.equals("META") || tagName.equals("IMG"))
                        depth--;
                }
                else {
                    writeNode(writer, depth, node, false);
                }
            }
            endDate = new Date();
            elapsed = endDate.getTime() - startDate.getTime();
            System.out.println("Elapsed time PREPARING nodes: " + elapsed + " milliseconds.");
        }
        catch (ParserException pe) {
            System.err.println("F***! A problem occurred while parsing.");
        }
        writer.flush();
    }

    private static void writeNode(Writer writer, long depth, Node node, boolean newline)
        throws IOException
    {
        writer.write(node.toHtml());
        if (newline) writer.write("\n");
    }

    private static void writeReportRowOpeners(Writer writer, long depth, long rowCount, Tag trTag, Tag tdTag, Tag aTag, Tag imgTag)
        throws IOException
    {
        String fixedHref;
        Matcher m = HrefPattern.matcher(aTag.getAttribute("href"));
        fixedHref = m.replaceFirst(String.valueOf(rowCount));
        writer.write("<tr>\n");
        writer.write("<td style=\"" + tdTag.getAttribute("style") + "\" class=\"" + tdTag.getAttribute("class") + "\">\n");
        writer.write("<a href=\"" + fixedHref + "\" style=\"" + aTag.getAttribute("style") + "\" class=\"" + aTag.getAttribute("class") + "\">\n");
        writer.write("<img name=\"" + imgTag.getAttribute("name") + "\" id=\"reportRowDrill" + rowCount + "\" />\n");
    }

    public static void main(String[] args)
    {
        try {
            FileInputStream fis = new FileInputStream("C:\\Documents and Settings\\Administrator\\Desktop\\Purchase Transactions.htm");
            FileOutputStream fos = new FileOutputStream("C:\\Documents and Settings\\Administrator\\Desktop\\Output.htm");
            ReportOutputFilter.filterReportStream(fis, fos);
            fis.close();
            fos.close();
        }
        catch (IOException ioe) {
            System.err.println("IOException.");
        }
    }
}
-----
Really? Nobody has any idea why the Lexer appears to increase heap usage in a linear fashion as it traverses longer and longer files? I'm not hanging onto any references. I haven't created a Parser, so there's no parse tree in use. What's going on?
The InputStreamSource object held by the lexer contains a large array of all characters read so far (mBuffer). This is probably what you are seeing.
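If you want to confirm that this is where the memory goes, a quick probe along these lines should show used heap climbing roughly in step with how much of the file has been consumed. This is only a minimal sketch, not part of the program above: the class name LexerHeapProbe and the 10,000-node sampling interval are arbitrary, and it reuses just the Page/Lexer calls already shown plus java.lang.Runtime.
-----
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import java.io.FileInputStream;
import java.io.IOException;

public class LexerHeapProbe {
    public static void main(String[] args) throws IOException, ParserException {
        FileInputStream fis = new FileInputStream(args[0]);
        Lexer lexer = new Lexer(new Page(fis, "UTF-8"));
        Runtime rt = Runtime.getRuntime();
        long count = 0;
        // Lex the whole document without keeping any node references,
        // sampling used heap every 10,000 nodes.
        while (null != lexer.nextNode()) {
            if (++count % 10000 == 0) {
                long usedBytes = rt.totalMemory() - rt.freeMemory();
                System.out.println(count + " nodes, ~" + (usedBytes / (1024 * 1024)) + " MB used");
            }
        }
        fis.close();
    }
}
-----
If the numbers do climb with the document, the practical options would seem to be giving the JVM a larger heap for big reports (e.g. -Xmx256m), or splitting the input into smaller pieces and lexing each piece with a fresh Page/Lexer so the previous buffer can be garbage collected, assuming nothing else holds a reference to the old Page.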