From: Michael S. <sta...@us...> - 2005-10-31 18:00:26
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13604/src/java/org/archive/access/nutch

Modified Files:
    NutchwaxSegmentMergeTool.java
Log Message:
* src/java/org/archive/access/nutch/NutchwaxSegmentMergeTool.java
  Added deduping that counts the collection name.

Index: NutchwaxSegmentMergeTool.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/NutchwaxSegmentMergeTool.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** NutchwaxSegmentMergeTool.java 27 Oct 2005 16:09:52 -0000 1.3
--- NutchwaxSegmentMergeTool.java 31 Oct 2005 18:00:17 -0000 1.4
***************
*** 238,256 ****
  String name = sr.segmentDir.getName();
  FetcherOutput fo = new FetcherOutput();
  for (long i = 0; i < sr.size; i++) {
    try {
!     if (!sr.get(i, fo, null, null, null)) break;
      Document doc = new Document();
      // compute boost
!     float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
          scorePower, boostByLinkCount, fo.getAnchors().length);
      doc.add(new Field("sd", name + "|" + i, true, false, false));
!     doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
!     doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
!     doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
!     doc.add(new Field("score", boost + "", true, false, false));
!     doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
      iw.addDocument(doc);
      processedRecords++;
--- 238,269 ----
  String name = sr.segmentDir.getName();
  FetcherOutput fo = new FetcherOutput();
+ ParseData pd = new ParseData();
  for (long i = 0; i < sr.size; i++) {
    try {
!     if (!sr.get(i, fo, null, null, pd))
!       break;
      Document doc = new Document();
      // compute boost
!     float boost = IndexSegment.calculateBoost(
!         fo.getFetchListEntry().getPage().getScore(),
          scorePower, boostByLinkCount, fo.getAnchors().length);
      doc.add(new Field("sd", name + "|" + i, true, false, false));
!     // doc.add(new Field("uh",
!     //     MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
!     // doc.add(new Field("ch", fo.getMD5Hash().toString(),
!     //     true, true, false));
!     doc.add(new Field("time",
!         DateField.timeToString(fo.getFetchDate()), true, false, false));
!     // doc.add(new Field("score", boost + "", true, false, false));
!     // doc.add(new Field("ul", fo.getUrl().toString().length() + "", true,
!     //     false, false));
!
!     // Hash up the content hash, the url itself and the collection name.
!     String hashStr = fo.getMD5Hash().toString() + fo.getUrl().toString() +
!         pd.getMetadata().getProperty("collection");
!     doc.add(new Field("ucc", MD5Hash.digest(hashStr).toString(), true, true,
!         false));
      iw.addDocument(doc);
      processedRecords++;
***************
*** 298,411 ****
  }
  iw.close();
! LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
! LOG.info("* Skipping deduplicate step...");
! // LOG.info("* Removing duplicate entries...");
! // stage = SegmentMergeStatus.STAGE_DEDUP;
! IndexReader ir = IndexReader.open(masterDir);
! // int i = 0;
! // long cnt = 0L;
! // processedRecords = 0L;
! // s1 = System.currentTimeMillis();
! // delta = s1;
! // TermEnum te = ir.terms();
! // while(te.next()) {
! //   Term t = te.term();
! //   if (t == null) continue;
! //   if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
! //   cnt++;
! //   processedRecords = cnt / 2;
! //   if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
! //     LOG.info(" Processed " + processedRecords + " records (" +
! //         (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
! //     delta = System.currentTimeMillis();
! //   }
! //   // Enumerate all docs with the same URL hash or content hash
! //   TermDocs td = ir.termDocs(t);
! //   if (td == null) continue;
! //   if (t.field().equals("uh")) {
! //     // Keep only the latest version of the document with
! //     // the same url hash. Note: even if the content
! //     // hash is identical, other metadata may be different, so even
! //     // in this case it makes sense to keep the latest version.
! //     int id = -1;
! //     String time = null;
! //     Document doc = null;
! //     while (td.next()) {
! //       int docid = td.doc();
! //       if (!ir.isDeleted(docid)) {
! //         doc = ir.document(docid);
! //         if (time == null) {
! //           time = doc.get("time");
! //           id = docid;
! //           continue;
! //         }
! //         String dtime = doc.get("time");
! //         // "time" is a DateField, and can be compared lexicographically
! //         if (dtime.compareTo(time) > 0) {
! //           if (id != -1) {
! //             ir.delete(id);
! //           }
! //           time = dtime;
! //           id = docid;
! //         } else {
! //           ir.delete(docid);
! //         }
! //       }
! //     }
! //   } else if (t.field().equals("ch")) {
! //     // Keep only the version of the document with
! //     // the highest score, and then with the shortest url.
! //     int id = -1;
! //     int ul = 0;
! //     float score = 0.0f;
! //     Document doc = null;
! //     while (td.next()) {
! //       int docid = td.doc();
! //       if (!ir.isDeleted(docid)) {
! //         doc = ir.document(docid);
! //         if (ul == 0) {
! //           try {
! //             ul = Integer.parseInt(doc.get("ul"));
! //             score = Float.parseFloat(doc.get("score"));
! //           } catch (Exception e) {};
! //           id = docid;
! //           continue;
! //         }
! //         int dul = 0;
! //         float dscore = 0.0f;
! //         try {
! //           dul = Integer.parseInt(doc.get("ul"));
! //           dscore = Float.parseFloat(doc.get("score"));
! //         } catch (Exception e) {};
! //         int cmp = Float.compare(dscore, score);
! //         if (cmp == 0) {
! //           // equal scores, select the one with shortest url
! //           if (dul < ul) {
! //             if (id != -1) {
! //               ir.delete(id);
! //             }
! //             ul = dul;
! //             id = docid;
! //           } else {
! //             ir.delete(docid);
! //           }
! //         } else if (cmp < 0) {
! //           ir.delete(docid);
! //         } else {
! //           if (id != -1) {
! //             ir.delete(id);
! //           }
! //           ul = dul;
! //           id = docid;
! //         }
! //       }
! //     }
! //   }
! // }
! // //
! // // keep the IndexReader open...
! // //
! //
! // LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
  stage = SegmentMergeStatus.STAGE_WRITING;
  processedRecords = 0L;
--- 311,375 ----
  }
  iw.close();
! LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) +
!     " ms");
! LOG.info("* Dedupling based off hash of content-md5 + url + collection...");
! stage = SegmentMergeStatus.STAGE_DEDUP;
! IndexReader ir = IndexReader.open(masterDir);
! int i = 0;
! long cnt = 0L;
! processedRecords = 0L;
! s1 = System.currentTimeMillis();
! delta = s1;
! TermEnum te = ir.terms();
! while(te.next()) {
!   Term t = te.term();
!   if (t == null) continue;
!   if (!(t.field().equals("ucc"))) continue;
!   cnt++;
!   processedRecords = cnt / 2;
!   if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
!     LOG.info(" Processed " + processedRecords + " records (" +
!         (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) +
!         " rec/s)");
!     delta = System.currentTimeMillis();
!   }
!   // Enumerate all docs with the same URL + content + collection hash.
!   TermDocs td = ir.termDocs(t);
!   if (td == null) continue;
!   if (t.field().equals("ucc")) {
!     // Keep only the latest version of the document with
!     // the same url + content + collection hash.
!     int id = -1;
!     String time = null;
!     Document doc = null;
!     while (td.next()) {
!       int docid = td.doc();
!       if (!ir.isDeleted(docid)) {
!         doc = ir.document(docid);
!         if (time == null) {
!           time = doc.get("time");
!           id = docid;
!           continue;
!         }
!         String dtime = doc.get("time");
!         // "time" is a DateField, and can be compared lexicographically
!         if (dtime.compareTo(time) > 0) {
!           if (id != -1) {
!             ir.delete(id);
!           }
!           time = dtime;
!           id = docid;
!         } else {
!           ir.delete(docid);
!         }
!       }
!     }
!   }
! }
! //
! // keep the IndexReader open...
! //
!
! LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
  stage = SegmentMergeStatus.STAGE_WRITING;
  processedRecords = 0L;
|