From: <ap...@vh...> - 2006-06-07 22:02:11
|
Author: apevec Date: 2006-06-07 23:57:40 +0200 (Wed, 07 Jun 2006) New Revision: 1149 Added: trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java Modified: trunk/ccm-core/pdl/com/arsdigita/categorization/Category.pdl trunk/ccm-core/sql/ccm-core/default/categorization/table-cat_object_category_map.sql trunk/ccm-core/sql/ccm-core/upgrade/oracle-se-6.2.0-6.3.1.sql trunk/ccm-core/sql/ccm-core/upgrade/postgres-6.2.0-6.3.1.sql trunk/ccm-ldn-aplaws/application.xml trunk/ccm-ldn-navigation/src/com/arsdigita/london/navigation/NavigationFileResolver.java Log: CLI auto-categorisation using Cintra's searchLight index NOTE: data model change, isAuto flag added to cat/obj mapping Modified: trunk/ccm-core/pdl/com/arsdigita/categorization/Category.pdl =================================================================== --- trunk/ccm-core/pdl/com/arsdigita/categorization/Category.pdl 2006-06-04 14:28:17 UTC (rev 1148) +++ trunk/ccm-core/pdl/com/arsdigita/categorization/Category.pdl 2006-06-07 21:57:40 UTC (rev 1149) @@ -67,6 +67,7 @@ to cat_object_category_map.category_id, join cat_object_category_map.object_id to acs_objects.object_id; + Boolean[0..1] isAuto = cat_object_category_map.auto_p CHAR(1); Boolean[0..1] isDefault = cat_object_category_map.default_p CHAR(1); Boolean[0..1] isIndex = cat_object_category_map.index_p CHAR(1); BigDecimal[0..1] sortKey = cat_object_category_map.sort_key; Modified: trunk/ccm-core/sql/ccm-core/default/categorization/table-cat_object_category_map.sql =================================================================== --- trunk/ccm-core/sql/ccm-core/default/categorization/table-cat_object_category_map.sql 2006-06-04 14:28:17 UTC (rev 1148) +++ trunk/ccm-core/sql/ccm-core/default/categorization/table-cat_object_category_map.sql 2006-06-07 21:57:40 UTC (rev 1149) @@ -32,6 +32,9 @@ index_p char(1) constraint cat_obj_map_index_p_ck check(index_p in ('0','1')), + auto_p char(1) default '0' + constraint cat_obj_map_auto_p_ck + check(auto_p 
in ('0','1')), sort_key integer, constraint cat_obj_cat_map_ckone check(not category_id = object_id), Modified: trunk/ccm-core/sql/ccm-core/upgrade/oracle-se-6.2.0-6.3.1.sql =================================================================== --- trunk/ccm-core/sql/ccm-core/upgrade/oracle-se-6.2.0-6.3.1.sql 2006-06-04 14:28:17 UTC (rev 1148) +++ trunk/ccm-core/sql/ccm-core/upgrade/oracle-se-6.2.0-6.3.1.sql 2006-06-07 21:57:40 UTC (rev 1149) @@ -1 +1,2 @@ @@ ../default/upgrade/6.2.0-6.3.1/preferred-categories.sql +@@ ../default/upgrade/6.2.0-6.3.1/auto-categorization.sql Modified: trunk/ccm-core/sql/ccm-core/upgrade/postgres-6.2.0-6.3.1.sql =================================================================== --- trunk/ccm-core/sql/ccm-core/upgrade/postgres-6.2.0-6.3.1.sql 2006-06-04 14:28:17 UTC (rev 1148) +++ trunk/ccm-core/sql/ccm-core/upgrade/postgres-6.2.0-6.3.1.sql 2006-06-07 21:57:40 UTC (rev 1149) @@ -1 +1,7 @@ +begin; \i ../default/upgrade/6.2.0-6.3.1/preferred-categories.sql +\i ../default/upgrade/6.2.0-6.3.1/auto-categorization.sql +create or replace function last_day(date) returns date as 'select +cast(date_trunc(''month'', $1) + ''1 month''::interval as date) - 1' +language sql; +commit; Modified: trunk/ccm-ldn-aplaws/application.xml =================================================================== --- trunk/ccm-ldn-aplaws/application.xml 2006-06-04 14:28:17 UTC (rev 1148) +++ trunk/ccm-ldn-aplaws/application.xml 2006-06-07 21:57:40 UTC (rev 1149) @@ -11,6 +11,7 @@ <ccm:requires name="ccm-ldn-subsite" version="6.2.0" relation="ge"/> <ccm:requires name="ccm-ldn-portal" version="6.2.0" relation="ge"/> <ccm:requires name="ccm-ldn-terms" version="6.2.0" relation="ge"/> + <ccm:requires name="ccm-ldn-dublin" version="6.2.0" relation="ge"/> <ccm:requires name="ccm-ldn-navigation" version="6.2.0" relation="ge"/> <ccm:requires name="ccm-cms-types-article" version="6.2.0" relation="ge"/> </ccm:dependencies> Added: 
trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java
===================================================================
--- trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java 2006-06-04 14:28:17 UTC (rev 1148)
+++ trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java 2006-06-07 21:57:40 UTC (rev 1149)
@@ -0,0 +1,631 @@
+package com.arsdigita.aplaws;
+
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.OptionBuilder;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import com.arsdigita.categorization.Category;
+import com.arsdigita.cms.ContentItem;
+import com.arsdigita.cms.ContentPage;
+import com.arsdigita.cms.ContentSection;
+import com.arsdigita.cms.ContentSectionCollection;
+import com.arsdigita.cms.ContentSectionConfig;
+import com.arsdigita.cms.dispatcher.ItemResolver;
+import com.arsdigita.domain.DataObjectNotFoundException;
+import com.arsdigita.domain.DomainServiceInterfaceExposer;
+import com.arsdigita.london.cms.dublin.DublinCoreItem;
+import com.arsdigita.london.navigation.Navigation;
+import com.arsdigita.london.navigation.NavigationFileResolver;
+import com.arsdigita.london.terms.Domain;
+import com.arsdigita.london.terms.Term;
+import com.arsdigita.london.util.Program;
+import com.arsdigita.persistence.DataAssociation;
+import com.arsdigita.persistence.DataAssociationCursor;
+import com.arsdigita.persistence.DataCollection;
+import com.arsdigita.persistence.DataObject;
+import com.arsdigita.persistence.Session;
+import com.arsdigita.persistence.SessionManager;
+import com.arsdigita.web.Application;
+/**
+ * Command line tool to automatically assign terms.
+ *
+ * Input is a MASmedia Searchlight Indexer report XML file.
+ * This service is available for registered users
+ * at http://demo.masprovider.com/searchLight/
+ *
+ * Arguments: FILENAME SCORE. Terms scoring below SCORE are ignored. With
+ * the -t/--test option nothing is written; planned changes are only printed.
+ *
+ * @author ap...@re...
+ */
+public class AutoCategorisation extends Program {
+
+    private static final String CCM_PREFIX = "/ccm/";
+
+    /** Query parameter carrying the category in Navigation URLs. */
+    private static final String CATEGORY_ID_PARAM = "categoryID=";
+
+    private SAXParser parser;
+
+    public AutoCategorisation() {
+        super("AutoCategorisation", "1.0", "FILENAME(Searchlight XML report) SCORE(minimal term score)");
+        getOptions().addOption
+            (OptionBuilder
+             .hasArg(false)
+             .withLongOpt("test")
+             .withDescription("Test only")
+             .create('t'));
+
+        try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            disableExternalEntities(factory);
+            parser = factory.newSAXParser();
+        } catch (ParserConfigurationException pce) {
+            throw new RuntimeException("SAX parser configuration error", pce);
+        } catch (SAXException se) {
+            throw new RuntimeException("SAX parser error", se);
+        }
+    }
+
+    /**
+     * Best-effort hardening: the report comes from an external service, so
+     * external entities should not be resolved while parsing it. Parsers that
+     * do not support switching a feature off are used as they are.
+     */
+    private static void disableExternalEntities(SAXParserFactory factory) {
+        String[] features = {
+            "http://xml.org/sax/features/external-general-entities",
+            "http://xml.org/sax/features/external-parameter-entities"
+        };
+        for (int i = 0; i < features.length; i++) {
+            try {
+                factory.setFeature(features[i], false);
+            } catch (ParserConfigurationException pce) {
+                System.err.println("warning: cannot disable " + features[i]);
+            } catch (SAXException se) {
+                System.err.println("warning: cannot disable " + features[i]);
+            }
+        }
+    }
+
+    protected void doRun(CommandLine cmdLine) {
+        String[] args = cmdLine.getArgs();
+        if (args.length != 2) {
+            help(System.err);
+            System.exit(1);
+            return;
+        }
+        String filename = args[0];
+        int minScore;
+        try {
+            minScore = Integer.parseInt(args[1]);
+        } catch (NumberFormatException nfe) {
+            System.err.println("SCORE must be an integer: " + args[1]);
+            help(System.err);
+            System.exit(1);
+            return;
+        }
+        boolean persistChanges = !cmdLine.hasOption('t');
+        FileInputStream in = null;
+        try {
+            // hand the parser a byte stream so that it honours the encoding
+            // declared in the XML prolog instead of the platform default
+            in = new FileInputStream(filename);
+            parser.parse(new InputSource(in),
+                         new SearchlightHandler(minScore, isDebug(), isVerbose(), persistChanges));
+        } catch (SAXException ex) {
+            throw new RuntimeException("cannot parse " + filename, ex);
+        } catch (IOException ex) {
+            throw new RuntimeException("cannot read " + filename, ex);
+        } finally {
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException ignored) {
+                    // nothing was written to the stream; a failed close is harmless
+                }
+            }
+        }
+    }
+
+    public static void main(String[] args) {
+        new AutoCategorisation().run(args);
+    }
+
+    /**
+     * Parse the XML site report from MASmedia Searchlight Indexer.
+     *
+     * For every reported URL the content item is resolved on
+     * &lt;/urladdress&gt;, terms are assigned on &lt;/terms&gt; and Dublin Core
+     * keywords are merged on &lt;/unformated_data&gt;.
+     *
+     * @author apevec
+     */
+    private static class SearchlightHandler extends DefaultHandler {
+
+        /* Sample of the report fragment handled for each URL:
+           <tags>
+            <terms>
+             <term score="40" thesarus="IPSV" id="6809">Information services</term>
+             <term score="40" thesarus="IPSV" id="5546">Library and information services</term>
+            </terms>
+            <category><![CDATA[<meta name="eGMS.subject.category" scheme="IPSV" content="Library and information services" />]]></category>
+            <keyword><![CDATA[<meta name="eGMS.subject.keyword" scheme="IPSV" content="Information services" />]]></keyword>
+            <unformated_data>
+             <keyword_data scheme="IPSV-EX">Citizens Advice Bureaux; Citizen's Advice Bureaux</keyword_data>
+             <keyword_data scheme="IPSV">Citizens Advice Bureaux; Citizen's Advice Bureaux; Advice centres; Parliament (European)</keyword_data>
+             <category_data scheme="IPSV">Citizens Advice Bureaux; Citizen's Advice Bureaux; Advice agencies; European Parliament</category_data>
+            </unformated_data>
+           </tags>
+         */
+
+        private final int minScore;
+        private final boolean isDebug;
+        private final boolean isVerbose;
+        private final boolean persistChanges;
+
+        /** Text of the element currently being read. */
+        private StringBuffer buffer = new StringBuffer();
+        private String urlid;
+        private String url;
+        /** Item the current URL resolved to, or null. */
+        private ContentItem item;
+        /** Term looked up for the open &lt;term&gt; element, or null. */
+        private Term term;
+        private int score = 0;
+        /** Terms accepted for the current item, assigned on &lt;/terms&gt;. */
+        private final Collection terms = new LinkedList();
+        /** Keywords reported for the current item, in report order. */
+        private final Collection keywords = new LinkedHashSet();
+        private boolean isKeyword;
+
+        public SearchlightHandler(int minScore, boolean isDebug, boolean isVerbose, boolean persistChanges) {
+            this.minScore = minScore;
+            this.isDebug = isDebug;
+            this.isVerbose = isVerbose;
+            this.persistChanges = persistChanges;
+        }
+
+        public void startDocument() throws SAXException {
+            if (isDebug) {
+                out("startDoc");
+            }
+        }
+
+        public void characters(char[] ch, int start, int len)
+            throws SAXException {
+            buffer.append(ch, start, len);
+        }
+
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) throws SAXException {
+            if (isDebug) {
+                out("startElement " + qName);
+            }
+            buffer = new StringBuffer();
+            if ("url".equals(qName)) {
+                urlid = null;
+                url = null;
+                item = null;
+                term = null;
+            } else if ("terms".equals(qName)) {
+                terms.clear();
+                score = 0;
+            } else if ("term".equals(qName)) {
+                // <term score="40" thesarus="IPSV-EX" id="9586">Travel</term>
+                term = (item != null) ? lookupTerm(attributes) : null;
+            } else if ("unformated_data".equals(qName)) {
+                keywords.clear();
+            } else if ("keyword_data".equals(qName)) {
+                isKeyword = isSupportedScheme(attributes.getValue("scheme"));
+            }
+        }
+
+        public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+            if (isDebug) {
+                out("endElement " + qName);
+            }
+            if ("urlid".equals(qName)) {
+                urlid = buffer.toString();
+            } else if ("urladdress".equals(qName)) {
+                url = buffer.toString();
+                item = resolveItem(url);
+            } else if ("terms".equals(qName)) {
+                if (item != null) {
+                    assignTerms();
+                } else if (isVerbose) {
+                    out("skip terms, item not found");
+                }
+            } else if ("term".equals(qName)) {
+                endTerm(buffer.toString());
+            } else if ("keyword_data".equals(qName)) {
+                if (isKeyword) {
+                    // store all keywords separately
+                    addKeywords(keywords, buffer.toString());
+                }
+            } else if ("unformated_data".equals(qName)) {
+                if (item instanceof ContentPage) {
+                    mergeKeywords((ContentPage) item);
+                }
+            }
+        }
+
+        public void endDocument() throws SAXException {
+            if (isDebug) {
+                out("endDoc");
+            }
+        }
+
+        /** Only IPSV and LGCL terms and keywords are taken from the report. */
+        private static boolean isSupportedScheme(String scheme) {
+            return "IPSV".equals(scheme) || "LGCL".equals(scheme);
+        }
+
+        /**
+         * Reads score, thesaurus and id from a &lt;term&gt; element and loads
+         * the matching term. Remembers the score in {@link #score}.
+         *
+         * @return the term, or null if the element is malformed, the
+         *         thesaurus is not supported or the term is not in the database
+         */
+        private Term lookupTerm(Attributes attributes) {
+            String scoreValue = attributes.getValue("score");
+            String idValue = attributes.getValue("id");
+            String domainKey = attributes.getValue("thesarus");
+            Integer uniqueID;
+            try {
+                score = Integer.parseInt(scoreValue);
+                uniqueID = Integer.valueOf(idValue);
+            } catch (NumberFormatException nfe) {
+                err("malformed term: score=" + scoreValue + " id=" + idValue);
+                return null;
+            }
+            // filter first, so that no lookup is done for other thesauri
+            if (!isSupportedScheme(domainKey)) {
+                return null;
+            }
+            Domain domain;
+            try {
+                domain = Domain.retrieve(domainKey);
+            } catch (DataObjectNotFoundException donfe) {
+                if (isVerbose) {
+                    out("domain not found: " + domainKey);
+                }
+                return null;
+            }
+            if (domain == null) {
+                return null;
+            }
+            try {
+                return domain.getTerm(uniqueID);
+            } catch (DataObjectNotFoundException donfe) {
+                if (isVerbose) {
+                    out("term not found: " + domainKey + '/' + uniqueID);
+                }
+                return null;
+            }
+        }
+
+        /**
+         * Handles &lt;/term&gt;: keeps the looked-up term for assignment if its
+         * name matches the report and its score is high enough.
+         */
+        private void endTerm(String termNameFromXML) {
+            if (term != null) {
+                // sanity check, compare term name found in MASmedia report
+                // and what is loaded in the database
+                if (!termNameFromXML.equals(term.getName())) {
+                    if (isVerbose) {
+                        out("term names differ XML:" + termNameFromXML
+                            + " DB:" + term.getName());
+                    }
+                } else if (score >= minScore) {
+                    // defer assigning of the term till </terms>
+                    terms.add(term);
+                } else if (isVerbose) {
+                    out("low score " + score + " for " + term.getName() + " " + term.getUniqueID());
+                }
+                term = null;
+            } else if (isVerbose) {
+                out("skipping term " + termNameFromXML);
+            }
+            // unchanged behaviour: keywords are reset whenever a term closes
+            keywords.clear();
+        }
+
+        /**
+         * Resolves a reported URL to a content item. Content Section URLs
+         * resolve to the draft of the live item, Navigation URLs to the
+         * index item of the category.
+         *
+         * @return the item, or null if the URL cannot be resolved
+         */
+        private ContentItem resolveItem(String address) {
+            int ccmPrefix = address.indexOf(CCM_PREFIX);
+            if (ccmPrefix < 0) {
+                if (isVerbose) {
+                    out("unsupported url: " + address);
+                }
+                return null;
+            }
+            int appBegin = ccmPrefix + CCM_PREFIX.length();
+            int appEnd = address.indexOf('/', appBegin);
+            if (appEnd <= appBegin) {
+                if (isVerbose) {
+                    out("unsupported CCM url: " + address);
+                }
+                return null;
+            }
+            String appURL = address.substring(appBegin, appEnd);
+            Application app = Application
+                .retrieveApplicationForPath('/' + appURL + '/');
+            if (app == null) {
+                if (isVerbose) {
+                    out("application not found: " + address);
+                }
+                return null;
+            }
+            String appType = app.getApplicationType()
+                .getApplicationObjectType();
+            if (ContentSection.BASE_DATA_OBJECT_TYPE.equals(appType)) {
+                return resolveSectionItem((ContentSection) app, address, appEnd);
+            }
+            if (Navigation.BASE_DATA_OBJECT_TYPE.equals(appType)) {
+                return resolveIndexItem((Navigation) app, address, appEnd);
+            }
+            if (isVerbose) {
+                out("unsupported application: " + appType + " at " + address);
+            }
+            return null;
+        }
+
+        /** Resolves the path below a Content Section to the draft of its live item. */
+        private ContentItem resolveSectionItem(ContentSection cs, String address, int appEnd) {
+            ItemResolver resolver = cs.getItemResolver();
+            if (resolver == null) {
+                if (isVerbose) {
+                    out("invalid configuration: CS without resolver");
+                }
+                return null;
+            }
+            ContentItem live = resolver.getItem(cs, address.substring(appEnd),
+                                                ContentItem.LIVE);
+            if (live == null) {
+                out("url not found: " + address);
+                return null;
+            }
+            return live.getDraftVersion();
+        }
+
+        /**
+         * Resolves a Navigation URL, either categoryID=N or a named category
+         * path, to the index item of that category.
+         */
+        private ContentItem resolveIndexItem(Navigation nav, String address, int appEnd) {
+            Category cat = null;
+            int catBegin = address.indexOf(CATEGORY_ID_PARAM, appEnd);
+            if (catBegin > appEnd) {
+                String idValue = address.substring(catBegin + CATEGORY_ID_PARAM.length());
+                // the id ends at the next query parameter, if there is one
+                int idEnd = idValue.indexOf('&');
+                if (idEnd > -1) {
+                    idValue = idValue.substring(0, idEnd);
+                }
+                try {
+                    cat = new Category(new BigDecimal(idValue.trim()));
+                } catch (NumberFormatException nfe) {
+                    err("invalid categoryID in url: " + address);
+                }
+            } else {
+                // named cat path, resolve using default context
+                // XXX subsites?
+                Category root = Category.getRootForObject(nav, null);
+                Category[] cats = NavigationFileResolver
+                    .resolveCategory(root, address.substring(appEnd));
+                if (cats != null && cats.length != 0) {
+                    cat = cats[cats.length - 1];
+                } else if (isVerbose) {
+                    out("category path not found: " + address);
+                }
+            }
+            if (cat == null) {
+                // nothing resolved, so there is no index item to look up
+                return null;
+            }
+            Object indexObject = cat.getIndexObject();
+            if (indexObject instanceof ContentItem) {
+                return (ContentItem) indexObject;
+            }
+            if (isVerbose) {
+                out("no index item for category: " + address);
+            }
+            return null;
+        }
+
+        /**
+         * Handles &lt;/terms&gt;: assigns the accepted terms to the item, drops
+         * automatic assignments that are no longer reported and flags the new
+         * mappings as isAuto. Does nothing if the item already carries a
+         * manually assigned IPSV or LGCL term.
+         */
+        private void assignTerms() {
+            Collection manualCategories = new HashSet();
+            Collection oldAutoCategories = new HashSet();
+            Collection keptAutoCategories = new HashSet();
+            Collection newAutoCategories = new HashSet();
+            DataAssociationCursor cursor = categoriesOf(item);
+            while (cursor.next()) {
+                Object categoryID = cursor.get("id");
+                // cat_object_category_map.auto_p is nullable; treat NULL as manual
+                if (Boolean.TRUE.equals(cursor.getLinkProperty("isAuto"))) {
+                    oldAutoCategories.add(categoryID);
+                } else {
+                    manualCategories.add(categoryID);
+                }
+            }
+            // prefer manual IPSV/LGCL over auto IPSV/LGCL
+            if (hasManualTerm(manualCategories)) {
+                return;
+            }
+            // assign all new auto-derived terms, unless already assigned
+            for (Iterator iter = terms.iterator(); iter.hasNext();) {
+                Term t = (Term) iter.next();
+                // must be the term being iterated: the 'term' field is
+                // always null by the time </terms> is reached
+                BigDecimal categoryID = t.getModel().getID();
+                if (manualCategories.contains(categoryID)) {
+                    if (isVerbose) {
+                        out("skip manually assigned " + t);
+                    }
+                } else if (oldAutoCategories.remove(categoryID)) {
+                    keptAutoCategories.add(categoryID);
+                    if (isVerbose) {
+                        out("skip automatically assigned " + t);
+                    }
+                } else if (!keptAutoCategories.contains(categoryID)
+                           && newAutoCategories.add(categoryID)) {
+                    // first occurrence of a term that is not assigned yet
+                    if (persistChanges) {
+                        t.addObject(item);
+                    }
+                    if (isVerbose) {
+                        out("ASSIGN " + t + " to " + item);
+                    }
+                }
+            }
+            // cleanup old auto-assigned terms
+            for (Iterator iter = oldAutoCategories.iterator(); iter.hasNext();) {
+                Category category = new Category((BigDecimal) iter.next());
+                if (persistChanges) {
+                    category.removeChild(item);
+                }
+                if (isDebug) {
+                    out("removing oldAuto " + category + " from " + item);
+                }
+            }
+            // mark the mappings created above as automatic
+            if (!newAutoCategories.isEmpty()) {
+                cursor = categoriesOf(item);
+                cursor.addEqualsFilter("id", newAutoCategories);
+                while (cursor.next()) {
+                    Object categoryID = cursor.get("id");
+                    if (persistChanges) {
+                        DataObject link = cursor.getLink();
+                        link.set("isAuto", Boolean.TRUE);
+                    }
+                    if (isDebug) {
+                        out("isAuto=TRUE for new categoryID=" + categoryID + "/" + item);
+                    }
+                }
+            }
+            // TODO derive other domains from assigned terms using mappings
+        }
+
+        /** Opens a cursor over the categories the item is mapped to. */
+        private static DataAssociationCursor categoriesOf(ContentItem owner) {
+            return ((DataAssociation) DomainServiceInterfaceExposer
+                    .get(owner, "categories")).cursor();
+        }
+
+        /** Tells whether any of the given categories is an IPSV or LGCL term. */
+        private boolean hasManualTerm(Collection manualCategories) {
+            if (manualCategories.isEmpty()) {
+                // nothing to look for; also avoids filtering on an empty list
+                return false;
+            }
+            DataCollection dc = SessionManager.getSession().retrieve(Term.BASE_DATA_OBJECT_TYPE);
+            dc.addEqualsFilter("model.id", manualCategories);
+            dc.addFilter("domain.key IN ('IPSV','LGCL')");
+            boolean found = false;
+            while (dc.next()) {
+                found = true;
+                if (isDebug) {
+                    out("manual IPSV/LGCL found");
+                }
+            }
+            return found;
+        }
+
+        /**
+         * Handles &lt;/unformated_data&gt;: appends the reported keywords to the
+         * "DC keywords" of the page, keeping the existing ones and their order.
+         *
+         * NOTE: "DC keywords" metadata is stored as a string, cannot tell
+         * which keywords are auto. To support that, datamodel change to
+         * 1:N mapping table would be required.
+         */
+        private void mergeKeywords(ContentPage pageItem) {
+            DublinCoreItem dcItem = DublinCoreItem.findByOwner(pageItem);
+            if (dcItem == null) {
+                if (isVerbose) {
+                    out("skip keywords, no Dublin Core metadata for " + pageItem);
+                }
+                return;
+            }
+            Collection merged = new LinkedHashSet();
+            String existing = dcItem.getKeywords();
+            if (existing != null) {
+                addKeywords(merged, existing);
+            }
+            int existingCount = merged.size();
+            merged.addAll(keywords);
+            if (merged.size() == existingCount) {
+                // nothing new was reported, leave the stored value untouched
+                return;
+            }
+            StringBuffer buf = new StringBuffer();
+            for (Iterator i = merged.iterator(); i.hasNext();) {
+                buf.append(i.next());
+                if (i.hasNext()) {
+                    buf.append("; ");
+                }
+            }
+            String dcKeywords = buf.toString();
+            if (isVerbose) {
+                out("ASSIGN keywords \"" + dcKeywords + "\"");
+            }
+            if (persistChanges) {
+                dcItem.setKeywords(dcKeywords);
+            }
+        }
+
+        /** Splits a ';' separated list and adds every non-empty keyword to target. */
+        private static void addKeywords(Collection target, String list) {
+            StringTokenizer tok = new StringTokenizer(list, ";");
+            while (tok.hasMoreTokens()) {
+                String keyword = tok.nextToken().trim();
+                if (keyword.length() > 0) {
+                    target.add(keyword);
+                }
+            }
+        }
+
+        private void out(String line) {
+            System.out.println(line);
+        }
+
+        private void err(String line) {
+            System.err.println(line);
+        }
+    }
+}
Modified: trunk/ccm-ldn-navigation/src/com/arsdigita/london/navigation/NavigationFileResolver.java
===================================================================
--- trunk/ccm-ldn-navigation/src/com/arsdigita/london/navigation/NavigationFileResolver.java 2006-06-04 14:28:17 UTC (rev 1148)
+++ trunk/ccm-ldn-navigation/src/com/arsdigita/london/navigation/NavigationFileResolver.java 2006-06-07 21:57:40 UTC (rev 1149)
@@ -241,7 +241,7 @@
 return root;
 }

- private Category[] resolveCategory(Category root,
+ public static Category[] resolveCategory(Category root,
 String path) {
 String[] bits = StringUtils.split(path, '/');
|