From: <ap...@vh...> - 2006-06-10 20:38:03
|
Author: apevec Date: 2006-06-10 22:33:39 +0200 (Sat, 10 Jun 2006) New Revision: 1155 Modified: trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java Log: refactoring after Cintra feedback: term@id is *NOT* Term.uniqueID - retrieve terms by name from unformated_data/category_data Modified: trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java =================================================================== --- trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java 2006-06-10 13:31:10 UTC (rev 1154) +++ trunk/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java 2006-06-10 20:33:39 UTC (rev 1155) @@ -29,6 +29,7 @@ import com.arsdigita.cms.ContentSectionConfig; import com.arsdigita.cms.dispatcher.ItemResolver; import com.arsdigita.domain.DataObjectNotFoundException; +import com.arsdigita.domain.DomainCollection; import com.arsdigita.domain.DomainServiceInterfaceExposer; import com.arsdigita.london.cms.dublin.DublinCoreItem; import com.arsdigita.london.navigation.Navigation; @@ -59,7 +60,7 @@ private SAXParser parser; public AutoCategorisation() { - super("AutoCategorisation", "1.0", "FILENAME(Searchlight XML report) SCORE(minimal term score)"); + super("AutoCategorisation", "1.0", "FILENAME(Searchlight XML report)"); getOptions().addOption (OptionBuilder .hasArg(false) @@ -78,13 +79,12 @@ protected void doRun(CommandLine cmdLine) { String[] args = cmdLine.getArgs(); - if (args.length == 2) { + if (args.length == 1) { String filename = args[0]; - int minScore = Integer.parseInt(args[1]); try { boolean persistChanges = ! cmdLine.hasOption('t'); parser.parse(new InputSource(new FileReader(filename)), - new SearchlightHandler(minScore, isDebug(), isVerbose(), persistChanges)); + new SearchlightHandler(isDebug(), isVerbose(), persistChanges)); } catch (SAXException ex) { throw new RuntimeException(ex); } catch (IOException ex) { @@ -115,15 +115,11 @@ String urlid; String url; ContentItem item; - Term term; - int score = 0; - Collection terms = new LinkedList(); + String scheme; Collection keywords = new HashSet(); - boolean isKeyword; - int minScore; + Collection terms = new HashSet(); - public SearchlightHandler(int minScore, boolean isDebug, boolean isVerbose, boolean persistChanges) { - this.minScore = minScore; + public SearchlightHandler(boolean isDebug, boolean isVerbose, boolean persistChanges) { this.isDebug = isDebug; this.isVerbose = isVerbose; this.persistChanges = persistChanges; @@ -152,61 +148,37 @@ urlid = null; url = null; item = null; - term = null; - } else if ("terms".equals(qName)) { - terms.clear(); - score = 0; - } else if ("term".equals(qName)) { - // <term score="40" thesarus="IPSV-EX" id="9586">Travel</term> - if (item != null) { - score = Integer.parseInt(attributes.getValue("score")); - String domainKey = attributes.getValue("thesarus"); - Integer uniqueID = Integer.valueOf(attributes - .getValue("id")); - Domain domain = null; - try { - domain = Domain.retrieve(domainKey); - } catch (DataObjectNotFoundException donfe) { - if (isVerbose) { - out("domain not found: " + domainKey); - } - } - // take IPSV and LGCL only - if (domain != null && ("IPSV".equals(domainKey) || "LGCL".equals(domainKey))) { - try { - term = domain.getTerm(uniqueID); - } catch (DataObjectNotFoundException donfe) { - if (isVerbose) { - out("term not found: " + domainKey + '/' + uniqueID); - } - } - } - } } else if ("unformated_data".equals(qName)) { keywords.clear(); + terms.clear(); + scheme = null; } else if ("keyword_data".equals(qName)) { - String scheme = attributes.getValue("scheme"); - if ("IPSV".equals(scheme) || "LGCL".equals(scheme)) { - isKeyword = true; - } else { - isKeyword = false; - } + scheme = attributes.getValue("scheme"); + } else if ("category_data".equals(qName)) { + scheme = attributes.getValue("scheme"); } // if qName } -/* <tags> + // XXX term@id is *NOT* Term.uniqueID, use category_date and retrieve terms by name +/* example from CAMDEN.xml + * <tags> <terms> - <term score="40" thesarus="IPSV" id="6809">Information services</term> - <term score="40" thesarus="IPSV" id="5546">Library and information services</term> + <term score="40" thesarus="LGTL" id="9087">A to Z</term> + <term score="6" thesarus="LGCS" id="8030">Development control</term> + <term score="6" thesarus="LGSL" id="10213">Development control</term> + <term score="6" thesarus="IPSV" id="12738">Development control</term> + <term score="6" thesarus="IPSV" id="11440">Domestic violence</term> + <term score="6" thesarus="IPSV" id="11695">Planning (town and country)</term> </terms> - <category><![CDATA[<meta name="eGMS.subject.category" scheme="IPSV" content="Library and information services" />]]></category> - <keyword><![CDATA[<meta name="eGMS.subject.keyword" scheme="IPSV" content="Information services" />]]></keyword> + <category><![CDATA[<meta name="eGMS.subject.category" scheme="LGCS" content="Development Control" />]]><![CDATA[<meta name="eGMS.subject.category" scheme="LGSL" content="Development Control" />]]><![CDATA[<meta name="eGMS.subject.category" scheme="IPSV" content="Domestic violence; Planning (town and country)" />]]></category> + <keyword><![CDATA[<meta name="eGMS.subject.keyword" scheme="LGTL" content="A to Z" />]]><![CDATA[<meta name="eGMS.subject.keyword" scheme="IPSV" content="Development Control" />]]></keyword> <unformated_data> - <keyword_data scheme="IPSV-EX">Citizens Advice Bureaux; Citizen's Advice Bureaux</keyword_data> - <keyword_data scheme="IPSV">Citizens Advice Bureaux; Citizen's Advice -Bureaux; Advice centres; Parliament (European)</keyword_data> - <category_data scheme="IPSV">Citizens Advice Bureaux; Citizen's Advice Bureaux; Advice agencies; European Parliament</category_data> + <keyword_data scheme="LGTL">A to Z</keyword_data> + <keyword_data scheme="IPSV">Development Control</keyword_data> + <category_data scheme="LGCS">Development Control</category_data> + <category_data scheme="LGSL">Development Control</category_data> + <category_data scheme="IPSV">Domestic violence; Planning (town and country)</category_data> </unformated_data> - </tags> + * </tags> */ public void endElement(String uri, String localName, String qName) throws SAXException { @@ -215,266 +187,293 @@ } if ("urlid".equals(qName)) { urlid = buffer.toString(); + if (isVerbose) { + out("urlid "+urlid); + } } else if ("urladdress".equals(qName)) { - url = buffer.toString(); - // resolve url to the item - // supported are Navigation and ContentSection URLs - int ccmPrefix = url.indexOf(CCM_PREFIX); - if (ccmPrefix > -1) { - int appBegin = ccmPrefix + 5; - int appEnd = url.indexOf('/', appBegin); - if (appEnd > appBegin) { - String appURL = url.substring(appBegin, appEnd); - Application app = Application - .retrieveApplicationForPath('/' + appURL + '/'); - if (app != null) { - String appType = app.getApplicationType() - .getApplicationObjectType(); - if (ContentSection.BASE_DATA_OBJECT_TYPE - .equals(appType)) { - // a Content Section URL detected, resolving - // item path - ContentSection cs = (ContentSection) app; - ItemResolver resolver = cs.getItemResolver(); - if (resolver != null) { - item = resolver.getItem(cs, url - .substring(appEnd), - ContentItem.LIVE); - if (item != null) { - item = item.getDraftVersion(); - } else { - out("url not found: " + url); - } - } else { - if (isVerbose) { - out("invalid configuration: CS without resolver"); - } + findItem(); + if (item != null && isVerbose) { + out("item " + item + " at " + url); + } + } else if ("keyword_data".equals(qName)) { + // merge all keywords + StringTokenizer tok = new StringTokenizer(buffer.toString(), + ";"); + while (tok.hasMoreTokens()) { + keywords.add(tok.nextToken().trim()); + } + } else if ("category_data".equals(qName)) { + // use only IPSV terms + if ("IPSV".equals(scheme)) { + StringTokenizer tok = new StringTokenizer( + buffer.toString(), ";"); + while (tok.hasMoreTokens()) { + Term term = findTerm(scheme, tok.nextToken().trim()); + if (term != null) { + terms.add(term); + } + } + } + } else if ("unformated_data".equals(qName)) { + assignKeywords(); + assignTerms(); + } // if qName + } + + public void endDocument() throws SAXException { + if (isDebug) { + out("endDoc"); + } + } + + private void out(String line) { + System.out.println(line); + } + + private void err(String line) { + System.err.println(line); + } + + private void findItem() { + url = buffer.toString(); + // resolve url to the item + // supported are Navigation and ContentSection URLs + int ccmPrefix = url.indexOf(CCM_PREFIX); + if (ccmPrefix > -1) { + int appBegin = ccmPrefix + 5; + int appEnd = url.indexOf('/', appBegin); + if (appEnd > appBegin) { + String appURL = url.substring(appBegin, appEnd); + Application app = Application + .retrieveApplicationForPath('/' + appURL + '/'); + if (app != null) { + String appType = app.getApplicationType() + .getApplicationObjectType(); + if (ContentSection.BASE_DATA_OBJECT_TYPE + .equals(appType)) { + // a Content Section URL detected, resolving + // item path + ContentSection cs = (ContentSection) app; + ItemResolver resolver = cs.getItemResolver(); + if (resolver != null) { + int queryBegin = url.indexOf('?', appEnd); + if (queryBegin > appEnd) { + // MPA URLs can have ?page=N which confuses c.a.cms.d.MLIR + url = url.substring(0,queryBegin); } - } else if (Navigation.BASE_DATA_OBJECT_TYPE - .equals(appType)) { - // a Navigation URL detected, resolving category - Navigation nav = (Navigation) app; - Category cat = null; - // categoryID= - int catBegin = url.indexOf("categoryID=", - appEnd); - if (catBegin > appEnd) { - BigDecimal catID = new BigDecimal(url - .substring(catBegin + 11)); - cat = new Category(catID); + item = resolver.getItem(cs, url + .substring(appEnd), + ContentItem.LIVE); + if (item != null) { + item = item.getDraftVersion(); } else { - // named cat path, resolve using default - // context - // XXX subsites? - Category root = Category.getRootForObject( - nav, null); - Category[] cats = NavigationFileResolver - .resolveCategory(root, url - .substring(appEnd)); - if (cats != null && cats.length != 0) { - cat = cats[cats.length - 1]; - } else { - if (isVerbose) { - out("category path not found: "+url); - } - } + out("live item not found for " + url); } - // category index item - if (cat != null) { - item = (ContentItem) cat.getIndexObject(); - } } else { if (isVerbose) { - out("unsupported application: " + appType + " at " + url); + out("ContentSection without resolver"); } } + } else if (Navigation.BASE_DATA_OBJECT_TYPE + .equals(appType)) { + // a Navigation URL detected, resolving category + Navigation nav = (Navigation) app; + Category cat = null; + // categoryID= + int catBegin = url.indexOf("categoryID=", + appEnd); + if (catBegin > appEnd) { + BigDecimal catID = new BigDecimal(url + .substring(catBegin + 11)); + cat = new Category(catID); + } else { + // named cat path, resolve using + // default context XXX subsites? + Category root = Category.getRootForObject( + nav, null); + Category[] cats = NavigationFileResolver + .resolveCategory(root, url + .substring(appEnd)); + if (cats != null && cats.length != 0) { + cat = cats[cats.length - 1]; + } else { + if (isVerbose) { + out("category path not found " + url); + } + } + } + // category index item + if (cat != null) { + item = (ContentItem) cat.getIndexObject(); + } } else { if (isVerbose) { - out("application not found: " + url); + out("unsupported application " + appType + " at " + url); } } } else { if (isVerbose) { - out("unsupported CCM url: " + url); + out("application not found " + url); } } } else { if (isVerbose) { - out("unsupported url: " + url); + out("unsupported CCM url " + url); } } - } else if ("terms".equals(qName)) { - if (item != null) { - Collection manualCategories = new HashSet(); - Collection oldAutoCategories = new HashSet(); - Collection newAutoCategories = new LinkedList(); - DataAssociationCursor cursor = ((DataAssociation)DomainServiceInterfaceExposer - .get(item, "categories")).cursor(); - // cat_object_category_map.auto_p - // cursor.addEqualsFilter("link.isAuto", Boolean.FALSE); - while (cursor.next()) { - Object categoryID = cursor.get("id"); - Boolean isAuto = (Boolean) cursor.getLinkProperty("isAuto"); - if (isAuto.booleanValue()) { - oldAutoCategories.add(categoryID); - } else { - manualCategories.add(categoryID); - } + } else { + if (isVerbose) { + out("unsupported url " + url); + } + } + } + + private void assignTerms() { + if (item != null) { + Collection manualCategories = new HashSet(); + Collection oldAutoCategories = new HashSet(); + Collection newAutoCategories = new LinkedList(); + DataAssociationCursor cursor = ((DataAssociation)DomainServiceInterfaceExposer + .get(item, "categories")).cursor(); + // cat_object_category_map.auto_p + // cursor.addEqualsFilter("link.isAuto", Boolean.FALSE); + while (cursor.next()) { + Object categoryID = cursor.get("id"); + Boolean isAuto = (Boolean) cursor.getLinkProperty("isAuto"); + if (isAuto.booleanValue()) { + oldAutoCategories.add(categoryID); + } else { + manualCategories.add(categoryID); } - // assign all new auto-derived terms, unless it's already assigned - // prefer manual IPSV/LGCL over auto IPSV/LGCL, deriving GCL, LGSL, LGDL - // check existing manual terms - boolean foundIPSVLGCL = false; - if (!manualCategories.isEmpty()) { - DataCollection dc = SessionManager.getSession().retrieve(Term.BASE_DATA_OBJECT_TYPE); - dc.addEqualsFilter("model.id", manualCategories); - dc.addFilter("domain.key IN ('IPSV','LGCL')"); - while (!foundIPSVLGCL && dc.next()) { - foundIPSVLGCL = true; - if (isVerbose) { - out("manual IPSV/LGCL found"); - } + } + // assign all new auto-derived terms, unless it's already assigned + // prefer manual IPSV/LGCL over auto IPSV, deriving GCL, LGSL, LGDL + // check existing manual terms + boolean foundIPSVLGCL = false; + if (!manualCategories.isEmpty()) { + DataCollection dc = SessionManager.getSession().retrieve(Term.BASE_DATA_OBJECT_TYPE); + dc.addEqualsFilter("model.id", manualCategories); + dc.addFilter("domain.key IN ('IPSV','LGCL')"); + while (!foundIPSVLGCL && dc.next()) { + foundIPSVLGCL = true; + if (isVerbose) { + out("manual IPSV/LGCL found"); } - dc.close(); } - if (!foundIPSVLGCL) { - for (Iterator iter=terms.iterator();iter.hasNext();) { - Term t = (Term) iter.next(); - BigDecimal categoryID = term.getModel().getID(); - if ( !manualCategories.contains(categoryID)) { - if (!oldAutoCategories.contains(categoryID)) { - if (persistChanges) { - t.addObject(item); - } - newAutoCategories.add(categoryID); // to be marked isAuto - if (isVerbose) { - out("ASSIGN " + term + " to " - + item); - } - } else { - oldAutoCategories.remove(categoryID); - if (isVerbose) { - out("skip automatically assigned "+term); - } - } - } else if (isVerbose){ - out("skip manually assigned "+term); - } - } - // cleanup old auto-assigned terms - for (Iterator iter=oldAutoCategories.iterator(); iter.hasNext(); ) { - Category category = new Category((BigDecimal) iter.next()); - if (persistChanges) { - category.removeChild(item); - } - if (isVerbose){ - out("removing oldAuto "+category+" from "+item); - } - } - if (!newAutoCategories.isEmpty()) { - cursor = ((DataAssociation)DomainServiceInterfaceExposer - .get(item, "categories")).cursor(); - cursor.addEqualsFilter("id", newAutoCategories); - while (cursor.next()) { - Object categoryID = cursor.get("id"); + dc.close(); + } + if (!foundIPSVLGCL) { + for (Iterator iter=terms.iterator();iter.hasNext();) { + Term t = (Term) iter.next(); + BigDecimal categoryID = t.getModel().getID(); + if ( !manualCategories.contains(categoryID)) { + if (!oldAutoCategories.contains(categoryID)) { if (persistChanges) { - DataObject link = cursor.getLink(); - link.set("isAuto",Boolean.TRUE); + t.addObject(item); } + newAutoCategories.add(categoryID); // to be marked isAuto if (isVerbose) { - out("isAuto=TRUE for new categoryID="+categoryID+"/"+item); + out("ASSIGN " + t + " to " + + item); } + } else { + oldAutoCategories.remove(categoryID); + if (isVerbose) { + out("already auto assigned "+t); + } } + } else if (isVerbose){ + out("already manually assigned "+t); } - } else { - // TODO derive other domains from assigned terms using mappings - if (isVerbose) { - out("derive from manual LGCL/IPSV"); + } + // cleanup old auto-assigned terms + for (Iterator iter=oldAutoCategories.iterator(); iter.hasNext(); ) { + Category category = new Category((BigDecimal) iter.next()); + if (persistChanges) { + category.removeChild(item); } + if (isVerbose){ + out("removing oldAuto "+category+" from "+item); + } } + if (!newAutoCategories.isEmpty()) { + cursor = ((DataAssociation)DomainServiceInterfaceExposer + .get(item, "categories")).cursor(); + cursor.addEqualsFilter("id", newAutoCategories); + while (cursor.next()) { + Object categoryID = cursor.get("id"); + if (persistChanges) { + DataObject link = cursor.getLink(); + link.set("isAuto",Boolean.TRUE); + } + if (isVerbose) { + out("isAuto=TRUE for new categoryID="+categoryID+"/"+item); + } + } + } } else { + // TODO derive other domains from assigned terms using mappings if (isVerbose) { - out("skip terms, item not found"); + out("derive from manual LGCL/IPSV"); } } - } else if ("term".equals(qName)) { - // sanity check, compare term name found in MASmedia report - // and what is loaded in the database - String termNameFromXML = buffer.toString(); - if (term != null) { - if (termNameFromXML.equals(term.getName())) { - if (score >= minScore) { - // defer assigning of the term till </terms> - terms.add(term); - } else if (isVerbose) { - out("low score "+score+" for "+term.getName()+" "+term.getUniqueID()); - } - } else if (isVerbose) { - out("term names differ XML:" - + termNameFromXML + " DB:" - + term.getName()); - } - term = null; - } else if (isVerbose) { - out("skipping term " - + termNameFromXML); - } - keywords.clear(); - } else if ("keyword_data".equals(qName)) { - if (isKeyword) { - // store all keywords separately - StringTokenizer tok = new StringTokenizer(buffer.toString(), ";"); + } + } + + private void assignKeywords() { + if (item != null) { + DublinCoreItem dcItem = DublinCoreItem.findByOwner(item); + // append into dcItem.getKeywords() + // NOTE: "DC keywords" metadata is stored as a string, cannot tell which keywords are auto. + // To support that, datamodel change to 1:N mapping table would be required. + String dcKeywords = dcItem.getKeywords(); + if (dcKeywords != null) { + StringTokenizer tok = new StringTokenizer(dcKeywords, ";"); + // merge old "DC keywords" into set of new ones while (tok.hasMoreTokens()) { keywords.add(tok.nextToken().trim()); } } - } else if ("unformated_data".equals(qName)) { - if (item != null && item instanceof ContentPage) { - ContentPage pageItem = (ContentPage) item; - DublinCoreItem dcItem = DublinCoreItem.findByOwner(pageItem); - // append into dcItem.getKeywords() - // NOTE: "DC keywords" metadata is stored as a string, cannot tell which keywords are auto. - // To support that, datamodel change to 1:N mapping table would be required. - String dcKeywords = dcItem.getKeywords(); - if (dcKeywords != null) { - StringTokenizer tok = new StringTokenizer(dcKeywords, ";"); - // merge old "DC keywords" into set of new ones - while (tok.hasMoreTokens()) { - keywords.add(tok.nextToken().trim()); - } - } - StringBuffer buf = new StringBuffer(); - // reconstruct "DC keywords" and store them - Iterator i=keywords.iterator(); - if (i.hasNext()) { - buf.append(i.next()); - } - for (; i.hasNext();) { - buf.append(' ').append(';').append(i.next()); - } - dcKeywords = buf.toString(); - if (isVerbose) { - out("ASSIGN DC keywords \""+dcKeywords+"\""); - } - if (persistChanges) { - dcItem.setKeywords(dcKeywords); - } + StringBuffer buf = new StringBuffer(); + // reconstruct "DC keywords" and store them + Iterator i=keywords.iterator(); + if (i.hasNext()) { + buf.append(i.next()); } - } // if qName - } - - public void endDocument() throws SAXException { - if (isDebug) { - out("endDoc"); + for (; i.hasNext();) { + buf.append(' ').append(';').append(i.next()); + } + dcKeywords = buf.toString(); + if (isVerbose) { + out("ASSIGN DC keywords \""+dcKeywords+"\""); + } + if (persistChanges) { + dcItem.setKeywords(dcKeywords); + } } } - private void out(String line) { - System.out.println(line); + private Term findTerm(String domainKey, String name) { + try { + Domain domain = Domain.retrieve(domainKey); + if (domain != null) { + DomainCollection terms = domain.getTerms(); + terms.addEqualsFilter(Term.NAME, name); + if (terms.next()) { + Term term = (Term) terms.getDomainObject(); + terms.close(); + return term; + } else if (isVerbose) { + out("term not found " + domainKey + '/' + name); + } + } + } catch (DataObjectNotFoundException donfe) { + if (isVerbose) { + out("domain not found " + domainKey); + } + } + return null; } - private void err(String line) { - System.err.println(line); - } } } |