From: <my...@us...> - 2009-11-04 17:09:28
|
Revision: 2114
          http://aperture.svn.sourceforge.net/aperture/?rev=2114&view=rev
Author:   mylka
Date:     2009-11-04 17:09:22 +0000 (Wed, 04 Nov 2009)

Log Message:
-----------
2891804 - added some code that removes the html entities in the HTML files metadata (title, description and keywords), allowed for multi-word keywords separated by commas

Modified Paths:
--------------
    aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java

Modified: aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java
===================================================================
--- aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java 2009-11-04 15:02:44 UTC (rev 2113)
+++ aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java 2009-11-04 17:09:22 UTC (rev 2114)
@@ -266,17 +266,20 @@
         metaTagName = metaTagName.toLowerCase();
         if (metaTagName.equals("author")) {
-            author = metaTagContent;
+            author = resolveText(metaTagContent);
         }
         else if (metaTagName.equals("description")) {
-            description = metaTagContent;
+            description = resolveText(metaTagContent);
         }
         else if (metaTagName.equals("keywords")) {
-            StringTokenizer tokenizer = new StringTokenizer(metaTagContent, " ,\t", false);
+            // originally this tokenizer worked on commas, tabs and spaces, this proved
+            // insufficient because some websites contain multi-word keywords, that's
+            // why I changed it to single comma, during the work on issue number 2891804
+            StringTokenizer tokenizer = new StringTokenizer(metaTagContent, ",", false);
             while (tokenizer.hasMoreTokens()) {
                 String keyword = tokenizer.nextToken();
                 if (keyword != null) {
-                    keywordBuffer.add(keyword);
+                    keywordBuffer.add(resolveText(keyword));
                 }
             }
         }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |