[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxSegmentMergeTool.java,1.3,1.4

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13604/src/java/org/archive/access/nutch

Modified Files:
	NutchwaxSegmentMergeTool.java 
Log Message:

* src/java/org/archive/access/nutch/NutchwaxSegmentMergeTool.java 
    Added deduping that takes the collection name into account: the dedup key
    is now a hash of the content MD5, the URL and the collection name.


Index: NutchwaxSegmentMergeTool.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/NutchwaxSegmentMergeTool.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** NutchwaxSegmentMergeTool.java	27 Oct 2005 16:09:52 -0000	1.3
--- NutchwaxSegmentMergeTool.java	31 Oct 2005 18:00:17 -0000	1.4
***************
*** 238,256 ****
          String name = sr.segmentDir.getName();
          FetcherOutput fo = new FetcherOutput();
          for (long i = 0; i < sr.size; i++) {
            try {
!             if (!sr.get(i, fo, null, null, null)) break;
  
              Document doc = new Document();
              
              // compute boost
!             float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
                      scorePower, boostByLinkCount, fo.getAnchors().length);
              doc.add(new Field("sd", name + "|" + i, true, false, false));
!             doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
!             doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
!             doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
!             doc.add(new Field("score", boost + "", true, false, false));
!             doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
              iw.addDocument(doc);
              processedRecords++;
--- 238,269 ----
          String name = sr.segmentDir.getName();
          FetcherOutput fo = new FetcherOutput();
+         ParseData pd = new ParseData();
          for (long i = 0; i < sr.size; i++) {
            try {
!             if (!sr.get(i, fo, null, null, pd))
!                 break;
  
              Document doc = new Document();
              
              // compute boost
!             float boost = IndexSegment.calculateBoost(
!                     fo.getFetchListEntry().getPage().getScore(),
                      scorePower, boostByLinkCount, fo.getAnchors().length);
              doc.add(new Field("sd", name + "|" + i, true, false, false));
!             // doc.add(new Field("uh", 
!             // MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
!             // doc.add(new Field("ch", fo.getMD5Hash().toString(), 
!             // true, true, false));
!             doc.add(new Field("time", 
!                 DateField.timeToString(fo.getFetchDate()), true, false, false));
!             // doc.add(new Field("score", boost + "", true, false, false));
!             // doc.add(new Field("ul", fo.getUrl().toString().length() + "", true,
!             // false, false));
! 
!             // Hash up the content hash, the url itself and the collection name.
!             String hashStr = fo.getMD5Hash().toString() + fo.getUrl().toString() +
!                 pd.getMetadata().getProperty("collection");
!             doc.add(new Field("ucc", MD5Hash.digest(hashStr).toString(), true, true,
!                 false));
              iw.addDocument(doc);
              processedRecords++;
***************
*** 298,411 ****
        }
        iw.close();
!       LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
!         LOG.info("* Skipping deduplicate step...");
! //      LOG.info("* Removing duplicate entries...");
! //      stage = SegmentMergeStatus.STAGE_DEDUP;
!         IndexReader ir = IndexReader.open(masterDir);
! //      int i = 0;
! //      long cnt = 0L;
! //      processedRecords = 0L;
! //      s1 = System.currentTimeMillis();
! //      delta = s1;
! //      TermEnum te = ir.terms();
! //      while(te.next()) {
! //        Term t = te.term();
! //        if (t == null) continue;
! //        if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
! //        cnt++;
! //        processedRecords = cnt / 2;
! //        if (cnt > 0 && (cnt % (LOG_STEP  * 2) == 0)) {
! //          LOG.info(" Processed " + processedRecords + " records (" +
! //                  (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
! //          delta = System.currentTimeMillis();
! //        }
! //        // Enumerate all docs with the same URL hash or content hash
! //        TermDocs td = ir.termDocs(t);
! //        if (td == null) continue;
! //        if (t.field().equals("uh")) {
! //          // Keep only the latest version of the document with
! //          // the same url hash. Note: even if the content
! //          // hash is identical, other metadata may be different, so even
! //          // in this case it makes sense to keep the latest version.
! //          int id = -1;
! //          String time = null;
! //          Document doc = null;
! //          while (td.next()) {
! //            int docid = td.doc();
! //            if (!ir.isDeleted(docid)) {
! //              doc = ir.document(docid);
! //              if (time == null) {
! //                time = doc.get("time");
! //                id = docid;
! //                continue;
! //              }
! //              String dtime = doc.get("time");
! //              // "time" is a DateField, and can be compared lexicographically
! //              if (dtime.compareTo(time) > 0) {
! //                if (id != -1) {
! //                  ir.delete(id);
! //                }
! //                time = dtime;
! //                id = docid;
! //              } else {
! //                ir.delete(docid);
! //              }
! //            }
! //          }
! //        } else if (t.field().equals("ch")) {
! //          // Keep only the version of the document with
! //          // the highest score, and then with the shortest url.
! //          int id = -1;
! //          int ul = 0;
! //          float score = 0.0f;
! //          Document doc = null;
! //          while (td.next()) {
! //            int docid = td.doc();
! //            if (!ir.isDeleted(docid)) {
! //              doc = ir.document(docid);
! //              if (ul == 0) {
! //                try {
! //                  ul = Integer.parseInt(doc.get("ul"));
! //                  score = Float.parseFloat(doc.get("score"));
! //                } catch (Exception e) {};
! //                id = docid;
! //                continue;
! //              }
! //              int dul = 0;
! //              float dscore = 0.0f;
! //              try {
! //                dul = Integer.parseInt(doc.get("ul"));
! //                dscore = Float.parseFloat(doc.get("score"));
! //              } catch (Exception e) {};
! //              int cmp = Float.compare(dscore, score);
! //              if (cmp == 0) {
! //                // equal scores, select the one with shortest url
! //                if (dul < ul) {
! //                  if (id != -1) {
! //                    ir.delete(id);
! //                  }
! //                  ul = dul;
! //                  id = docid;
! //                } else {
! //                  ir.delete(docid);
! //                }
! //              } else if (cmp < 0) {
! //                ir.delete(docid);
! //              } else {
! //                if (id != -1) {
! //                  ir.delete(id);
! //                }
! //                ul = dul;
! //                id = docid;
! //              }
! //            }
! //          }
! //        }
! //      }
! //      //
! //      // keep the IndexReader open...
! //      //
! //      
! //      LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
        stage = SegmentMergeStatus.STAGE_WRITING;
        processedRecords = 0L;
--- 311,375 ----
        }
        iw.close();
!       LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) +
!             " ms");
!       LOG.info("* Dedupling based off hash of content-md5 + url + collection...");
!       stage = SegmentMergeStatus.STAGE_DEDUP;
!       IndexReader ir = IndexReader.open(masterDir);
!       int i = 0;
!       long cnt = 0L;
!       processedRecords = 0L;
!       s1 = System.currentTimeMillis();
!       delta = s1;
!       TermEnum te = ir.terms();
!       while(te.next()) {
!         Term t = te.term();
!         if (t == null) continue;
!         if (!(t.field().equals("ucc"))) continue;
!         cnt++;
!         processedRecords = cnt / 2;
!         if (cnt > 0 && (cnt % (LOG_STEP  * 2) == 0)) {
!           LOG.info(" Processed " + processedRecords + " records (" +
!             (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) +
!             " rec/s)");
!           delta = System.currentTimeMillis();
!         }
!         // Enumerate all docs with the same URL + content + collection  hash.
!         TermDocs td = ir.termDocs(t);
!         if (td == null) continue;
!         if (t.field().equals("ucc")) {
!           // Keep only the latest version of the document with
!           // the same url + content + collection hash. 
!           int id = -1;
!           String time = null;
!           Document doc = null;
!           while (td.next()) {
!             int docid = td.doc();
!             if (!ir.isDeleted(docid)) {
!               doc = ir.document(docid);
!               if (time == null) {
!                 time = doc.get("time");
!                 id = docid;
!                 continue;
!               }
!               String dtime = doc.get("time");
!               // "time" is a DateField, and can be compared lexicographically
!               if (dtime.compareTo(time) > 0) {
!                 if (id != -1) {
!                   ir.delete(id);
!                 }
!                 time = dtime;
!                 id = docid;
!               } else {
!                 ir.delete(docid);
!               }
!             }
!           }
!         }
!       }
!       //
!       // keep the IndexReader open...
!       //
!       
!       LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
        stage = SegmentMergeStatus.STAGE_WRITING;
        processedRecords = 0L;

[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxSegment

[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxSegmentMergeTool.java,1.3,1.4