Why is my memory usage climbing so quickly?

Forum: Help
Created: 2005-02-12
Updated: 2013-04-27
  • Sit Ubu Sit - 2005-02-12

    I'm running a 17 MB HTML document through HTMLParser. I use Lexer.nextNode() in a loop to iterate over the entire document, writing it verbatim to an OutputStream except when I recognize a particular sequence of tags, which I then write out in a custom manner.

    If I leave my maximum heap size at 64 MB, the program gives up with an OutOfMemoryError before finishing the transformation process. I can't account for this -- is there any good reason the Lexer should accumulate heap space as it progresses through the document?

    Source code follows:

    -----

    import org.htmlparser.Node;
    import org.htmlparser.Tag;
    import org.htmlparser.lexer.Lexer;
    import org.htmlparser.lexer.Page;
    import org.htmlparser.util.ParserException;

    import java.io.*;
    import java.util.Date;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class ReportOutputFilter {

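       // Matches the literal placeholder "row_number" in each report row's
       // drill-down href; writeReportRowOpeners replaces it with the row index.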
       private static final Pattern HrefPattern = Pattern.compile("row_number");

       public static void filterReportStream(InputStream in, OutputStream out)
               throws IOException
          {
             int success = 0;
             long rowCount = 0;
             long depth = -1;
             Page page = new Page(in, "UTF-8");
             Lexer lexer = new Lexer(page);
             Node node;
             Tag tag;
             Tag trTag = null, tdTag = null, aTag = null, imgTag = null;
             String tagName;
             Date startDate, endDate;
             long elapsed;

             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));

             try {
                startDate = new Date();

                while (null != (node = lexer.nextNode())) {
                   if (node instanceof Tag) {
                      tag = (Tag) node;
                      tagName = tag.getTagName();

                      if (tag.isEndTag()) {
                         writeNode(writer, depth, node, false);
                         depth--;
                         continue;
                      }

                      depth++;

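                      // State machine: look for the opener sequence
                      // TR -> TD -> A -> IMG that marks a report row;
                      // any other start tag resets the match.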
                      if ("TR".equals(tagName)) {
                         trTag = tag;
                         success = 1;
                      }
                      else if ("TD".equals(tagName) && success == 1) {
                         tdTag = tag;
                         success = 2;
                      }
                      else if ("A".equals(tagName) && success == 2) {
                         aTag = tag;
                         success = 3;
                      }
                      else if ("IMG".equals(tagName) && success == 3) {
                         imgTag = tag;
                         // Full success, row tags found
                         // Write the whole bunch of opener tags to the output

                         writeReportRowOpeners(writer, depth, rowCount, trTag, tdTag, aTag, imgTag);
                         rowCount++;

                         trTag = tdTag = aTag = imgTag = null;
                         success = 0;
                      }
                      else {
                         trTag = tdTag = aTag = imgTag = null;
                         success = 0;
                         writeNode(writer, depth, node, false);
                      }

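                      // Tags that never get a matching end tag should not
                      // deepen the nesting level, so undo the increment.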
                      if (tag.isEmptyXmlTag() || tagName.equals("META") || tagName.equals("IMG"))
                         depth--;
                   }
                   else {
                      writeNode(writer, depth, node, false);
                   }
                }

                endDate = new Date();
                elapsed = endDate.getTime() - startDate.getTime();
                System.out.println("Elapsed time PREPARING nodes: " + elapsed + " milliseconds.");

             }
             catch (ParserException pe) {
                System.err.println("F***! A problem occurred while parsing.");
             }

             writer.flush();
          }

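       // Writes a node back out verbatim; the depth parameter is accepted for
       // symmetry with writeReportRowOpeners but is currently unused.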
       private static void writeNode(Writer writer, long depth, Node node, boolean newline)
               throws IOException
          {
             writer.write(node.toHtml());
             if (newline) writer.write("\n");
          }

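       // Emits rewritten opener tags for one matched report row, replacing the
       // "row_number" placeholder in the href with the actual row index.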
       private static void writeReportRowOpeners(Writer writer, long depth, long rowCount, Tag trTag, Tag tdTag, Tag aTag, Tag imgTag)
               throws IOException
          {
             String fixedHref;

             Matcher m = HrefPattern.matcher(aTag.getAttribute("href"));
             fixedHref = m.replaceFirst(String.valueOf(rowCount));

             writer.write("<tr>\n");
             writer.write("<td style=\"" + tdTag.getAttribute("style") + "\" class=\"" + tdTag.getAttribute("class") + "\">\n");
             writer.write("<a href=\"" + fixedHref + "\" style=\"" + aTag.getAttribute("style") + "\" class=\"" + aTag.getAttribute("class") + "\">\n");
             writer.write("<img name=\"" + imgTag.getAttribute("name") + "\" id=\"reportRowDrill" + rowCount + "\" />\n");
          }

       public static void main(String[] args)
          {
             try {
                FileInputStream fis = new FileInputStream("C:\\Documents and Settings\\Administrator\\Desktop\\Purchase Transactions.htm");

                FileOutputStream fos = new FileOutputStream("C:\\Documents and Settings\\Administrator\\Desktop\\Output.htm");

                ReportOutputFilter.filterReportStream(fis, fos);

                fis.close();
                fos.close();
             }
             catch (IOException ioe) {
                System.err.println("IOException.");
             }
          }

    }

    • Sit Ubu Sit - 2005-02-25

      Really? Nobody has any idea why the Lexer appears to increase heap usage in a linear fashion as it traverses longer and longer files? I'm not hanging onto any references. I haven't created a Parser, so there's no parse tree in use. What's going on?

      • Derrick Oswald - 2005-02-25

        The InputStreamSource object held by the lexer contains a large array of all characters read so far (mBuffer), so heap usage grows with the amount of input lexed. This is probably what you are seeing.
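
        One quick way to confirm this is to watch the heap while lexing. Below is a minimal diagnostic sketch, not a fix; it assumes only the Lexer and Page constructors already shown in this thread, and the HeapWatch class and its command-line argument are hypothetical:

        import org.htmlparser.lexer.Lexer;
        import org.htmlparser.lexer.Page;

        import java.io.FileInputStream;

        // Hypothetical diagnostic class, not part of HTMLParser.
        public class HeapWatch {

           public static void main(String[] args) throws Exception
              {
                 // Lex the file named on the command line, keeping no
                 // references to the returned nodes.
                 Page page = new Page(new FileInputStream(args[0]), "UTF-8");
                 Lexer lexer = new Lexer(page);
                 Runtime rt = Runtime.getRuntime();
                 long count = 0;

                 while (null != lexer.nextNode()) {
                    if (0 == ++count % 10000) {
                       // Approximate used heap; the trend matters more than
                       // the absolute figure.
                       long used = rt.totalMemory() - rt.freeMemory();
                       System.out.println(count + " nodes, ~" + (used / (1024 * 1024)) + " MB used");
                    }
                 }
              }
        }

        If the reported figure climbs steadily even though the loop keeps no references, the growth is inside the lexer's own source buffer rather than in the calling code.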

