Commit [870b5d] Maximize Restore History

3.01 code from http://github.com/jimregan/tesseract-ocr with adaptations related to Linux and Windows (VC2008) compile process

git-svn-id: http://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20

zdenop@gmail.com zdenop@gmail.com 2010-11-23

1 2 3 .. 26 > >> (Page 1 of 26)
added api/apitypes.h
added api/pageiterator.cpp
removed ccmain/blobcmp.cpp
changed ChangeLog
changed Makefile.am
changed Makefile.in
changed ReleaseNotes
changed api
changed api/Makefile.am
changed api/Makefile.in
changed api/baseapi.cpp
changed api/baseapi.h
changed api/tesseractmain.cpp
changed api/tesseractmain.h
changed ccmain
changed ccmain/Makefile.am
changed ccmain/Makefile.in
changed ccmain/adaptions.cpp
changed ccmain/applybox.cpp
copied ccmain/adaptions.h -> api/resultiterator.h
copied ccmain/ambigsrecog.cpp -> api/resultiterator.cpp
copied ccmain/applybox.h -> api/pageiterator.h
api/apitypes.h Diff Switch to side-by-side view
Loading...
api/pageiterator.cpp Diff Switch to side-by-side view
Loading...
ccmain/blobcmp.cpp
File was removed.
ChangeLog Diff Switch to side-by-side view
Loading...
Makefile.am Diff Switch to side-by-side view
Loading...
Makefile.in Diff Switch to side-by-side view
Loading...
ReleaseNotes Diff Switch to side-by-side view
Loading...
api
Directory.
api/Makefile.am Diff Switch to side-by-side view
Loading...
api/Makefile.in Diff Switch to side-by-side view
Loading...
api/baseapi.cpp Diff Switch to side-by-side view
Loading...
api/baseapi.h Diff Switch to side-by-side view
Loading...
api/tesseractmain.cpp Diff Switch to side-by-side view
Loading...
api/tesseractmain.h Diff Switch to side-by-side view
Loading...
ccmain
Directory.
ccmain/Makefile.am Diff Switch to side-by-side view
Loading...
ccmain/Makefile.in Diff Switch to side-by-side view
Loading...
ccmain/adaptions.cpp Diff Switch to side-by-side view
Loading...
ccmain/applybox.cpp Diff Switch to side-by-side view
Loading...
ccmain/adaptions.h to api/resultiterator.h
--- a/ccmain/adaptions.h
+++ b/api/resultiterator.h
@@ -1,89 +1,144 @@
-/**********************************************************************
- * File:        adaptions.h  (Formerly adaptions.h)
- * Description: Functions used to adapt to blobs already confidently
- *					identified
- * Author:		Chris Newton
- * Created:		Thu Oct  7 10:17:28 BST 1993
- *
- * (C) Copyright 1992, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.h
+// Description: Iterator for tesseract results that avoids using tesseract
+//              internal data structures.
+// Author:      Ray Smith
+// Created:     Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
 
-#ifndef           ADAPTIONS_H
-#define           ADAPTIONS_H
+#ifndef TESSERACT_API_RESULTITERATOR_H__
+#define TESSERACT_API_RESULTITERATOR_H__
 
-#include          "charsample.h"
-#include          "charcut.h"
-#include          "notdll.h"
+#include "pageiterator.h"
 
-extern BOOL_VAR_H (tessedit_reject_ems, FALSE, "Reject all m's");
-extern BOOL_VAR_H (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
-extern double_VAR_H (tessedit_cluster_t1, 0.20,
-"t1 threshold for clustering samples");
-extern double_VAR_H (tessedit_cluster_t2, 0.40,
-"t2 threshold for clustering samples");
-extern double_VAR_H (tessedit_cluster_t3, 0.12,
-"Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
-extern double_VAR_H (tessedit_cluster_accept_fraction, 0.80,
-"Largest fraction of characters in cluster for it to be used for adaption");
-extern INT_VAR_H (tessedit_cluster_min_size, 3,
-"Smallest number of samples in a cluster for it to be used for adaption");
-extern BOOL_VAR_H (tessedit_cluster_debug, FALSE,
-"Generate and print debug information for adaption by clustering");
-extern BOOL_VAR_H (tessedit_use_best_sample, FALSE,
-"Use best sample from cluster when adapting");
-extern BOOL_VAR_H (tessedit_test_cluster_input, FALSE,
-"Set reject map to enable cluster input to be measured");
-extern BOOL_VAR_H (tessedit_matrix_match, TRUE, "Use matrix matcher");
-extern BOOL_VAR_H (tessedit_old_matrix_match, FALSE, "Use matrix matcher");
-extern BOOL_VAR_H (tessedit_mm_use_non_adaption_set, FALSE,
-"Don't try to adapt to characters on this list");
-extern STRING_VAR_H (tessedit_non_adaption_set, ",.;:'~@*",
-"Characters to be avoided when adapting");
-extern BOOL_VAR_H (tessedit_mm_adapt_using_prototypes, TRUE,
-"Use prototypes when adapting");
-extern BOOL_VAR_H (tessedit_mm_use_prototypes, TRUE,
-"Use prototypes as clusters are built");
-extern BOOL_VAR_H (tessedit_mm_use_rejmap, FALSE,
-"Adapt to characters using reject map");
-extern BOOL_VAR_H (tessedit_mm_all_rejects, FALSE,
-"Adapt to all characters using, matrix matcher");
-extern BOOL_VAR_H (tessedit_mm_only_match_same_char, FALSE,
-"Only match samples against clusters for the same character");
-extern BOOL_VAR_H (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
-extern BOOL_VAR_H (tessedit_demo_adaption, FALSE,
-"Display cut images and matrix match for demo purposes");
-extern INT_VAR_H (tessedit_demo_word1, 62,
-"Word number of first word to display");
-extern INT_VAR_H (tessedit_demo_word2, 64,
-"Word number of second word to display");
-extern STRING_VAR_H (tessedit_demo_file, "academe",
-"Name of document containing demo words");
-extern BOOL_VAR_H(tessedit_adapt_to_char_fragments, TRUE,
-                  "Adapt to words that contain "
-                  " a character composed form fragments");
+class BLOB_CHOICE_IT;
 
-void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
-                    CHAR_SAMPLE_LIST *chars_waiting);
-                                 //lines of the image
-CHAR_SAMPLE *clip_sample(PIXROW *pixrow,
-                         IMAGELINE *imlines,
-                         TBOX pix_box,  //box of imlines extent
-                         BOOL8 white_on_black,
-                         char c);
-void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters);
-void reject_all_ems(WERD_RES *word);
-void reject_all_fullstops(WERD_RES *word);
-void reject_suspect_fullstops(WERD_RES *word);
-BOOL8 suspect_em(WERD_RES *word, inT16 index);
-BOOL8 suspect_fullstop(WERD_RES *word, inT16 i);
-#endif
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// ResultIterator adds text-specific methods for access to OCR output.
+
+class ResultIterator : public PageIterator {
+  friend class ChoiceIterator;
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+  // The scale and scaled_yres are in case the Thresholder scaled the image
+  // rectangle prior to thresholding. Any coordinates in tesseract's image
+  // must be divided by scale before adding (rect_left, rect_top).
+  // The scaled_yres indicates the effective resolution of the binary image
+  // that tesseract has been given by the Thresholder.
+  // After the constructor, Begin has already been called.
+  ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+                 int scale, int scaled_yres,
+                 int rect_left, int rect_top,
+                 int rect_width, int rect_height);
+  virtual ~ResultIterator();
+
+  // ResultIterators may be copied! This makes it possible to iterate over
+  // all the objects at a lower level, while maintaining an iterator to
+  // objects at a higher level. These constructors DO NOT CALL Begin, so
+  // iterations will continue from the location of src.
+  // TODO: For now the copy constructor and operator= only need the base class
+  // versions, but if new data members are added, don't forget to add them!
+
+  // ============= Moving around within the page ============.
+
+  // See PageIterator.
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // object at the given level. Use delete [] to free after use.
+  char* GetUTF8Text(PageIteratorLevel level) const;
+
+  // Returns the mean confidence of the current object at the given level.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence(PageIteratorLevel level) const;
+
+  // ============= Functions that refer to words only ============.
+
+  // Returns the font attributes of the current word. If iterating at a higher
+  // level object than words, eg textlines, then this will return the
+  // attributes of the first word in that textline.
+  // The actual return value is a string representing a font name. It points
+  // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+  // the iterator itself, ie rendered invalid by various members of
+  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+  // Pointsize is returned in printers points (1/72 inch.)
+  const char* WordFontAttributes(bool* is_bold,
+                                 bool* is_italic,
+                                 bool* is_underlined,
+                                 bool* is_monospace,
+                                 bool* is_serif,
+                                 int* pointsize,
+                                 int* font_id) const;
+
+  // Returns true if the current word was found in a dictionary.
+  bool WordIsFromDictionary() const;
+
+  // Returns true if the current word is numeric.
+  bool WordIsNumeric() const;
+};
+
+// Class to iterate over the classifier choices for a single RIL_SYMBOL.
+class ChoiceIterator {
+ public:
+  // Construction is from a ResultIterator that points to the symbol of
+  // interest. The ChoiceIterator allows a one-shot iteration over the
+  // choices for this symbol and after that it is useless.
+  explicit ChoiceIterator(const ResultIterator& result_it);
+  ~ChoiceIterator();
+
+  // Moves to the next choice for the symbol and returns false if there
+  // are none left.
+  bool Next();
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // choice.
+  // NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
+  // internal structure and should NOT be delete[]ed to free after use.
+  const char* GetUTF8Text() const;
+
+  // Returns the confidence of the current choice.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence() const;
+
+ private:
+  // Pointer to the Tesseract object owned by the API.
+  Tesseract* tesseract_;
+  // Iterator over the blob choices.
+  BLOB_CHOICE_IT* choice_it_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_API_RESULTITERATOR_H__
ccmain/ambigsrecog.cpp to api/resultiterator.cpp
--- a/ccmain/ambigsrecog.cpp
+++ b/api/resultiterator.cpp
@@ -1,11 +1,11 @@
 ///////////////////////////////////////////////////////////////////////
-// File:        genericvector.h
-// Description: Functions for producing classifications
-//              for the input to ambigstraining.
-// Author:      Daria Antonova
-// Created:     Mon Jun 23 11:26:43 PDT 2008
+// File:        resultiterator.cpp
+// Description: Iterator for tesseract results that avoids using tesseract
+//              internal data structures
+// Author:      Ray Smith
+// Created:     Fri Feb 26 14:32:09 PST 2010
 //
-// (C) Copyright 2007, Google Inc.
+// (C) Copyright 2010, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,162 +18,232 @@
 //
 ///////////////////////////////////////////////////////////////////////
 
-#include "ambigs.h"
-
-#include "applybox.h"
-#include "boxread.h"
-#include "control.h"
-#include "permute.h"
-#include "ratngs.h"
-#include "reject.h"
-#include "stopper.h"
+#include "resultiterator.h"
+#include "allheaders.h"
+#include "pageres.h"
 #include "tesseractclass.h"
 
 namespace tesseract {
 
-// Sets flags necessary for ambigs training mode.
-// Opens and returns the pointer to the output file.
-FILE *Tesseract::init_ambigs_training(const STRING &fname) {
-  permute_only_top = 1;                        // use only top choice permuter
-  tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
-  tessedit_ok_mode.set_value(0);               // turn off context checking
-  tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
-  save_best_choices.set_value(1);              // save individual char choices
-  stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
-  save_raw_choices.set_value(1);               // save raw choices
-
-  // Open ambigs output file.
-  STRING output_fname = fname;
-  const char *lastdot = strrchr(output_fname.string(), '.');
-  if (lastdot != NULL) {
-    output_fname[lastdot - output_fname.string()] = '\0';
-  }
-  output_fname += ".txt";
-  FILE *output_file;
-  if (!(output_file = fopen(output_fname.string(), "a+"))) {
-    CANTOPENFILE.error("ambigs_training", EXIT,
-                       "Can't open box file %s\n", output_fname.string());
-  }
-  return output_file;
-}
-
-// This function takes tif/box pair of files and runs recognition on the image,
-// while making sure that the word bounds that tesseract identified roughly
-// match to those specified by the input box file. For each word (ngram in a
-// single bounding box from the input box file) it outputs the ocred result,
-// the correct label, rating and certainty.
-void Tesseract::ambigs_training_segmented(const STRING &fname,
-                                          PAGE_RES *page_res,
-                                          volatile ETEXT_DESC *monitor,
-                                          FILE *output_file) {
-  STRING box_fname = fname;
-  const char *lastdot = strrchr(box_fname.string(), '.');
-  if (lastdot != NULL) {
-    box_fname[lastdot - box_fname.string()] = '\0';
-  }
-  box_fname += ".box";
-  FILE *box_file;
-  if (!(box_file = fopen(box_fname.string(), "r"))) {
-    CANTOPENFILE.error("ambigs_training", EXIT,
-                       "Can't open box file %s\n", box_fname.string());
-  }
-
-  static PAGE_RES_IT page_res_it;
-  page_res_it.page_res = page_res;
-  page_res_it.restart_page();
-  int x_min, y_min, x_max, y_max;
-  char label[UNICHAR_LEN * 10];
-
-  // Process all the words on this page.
-  while (page_res_it.word() != NULL &&
-         read_next_box(applybox_page, box_file, label,
-                       &x_min, &y_min, &x_max, &y_max)) {
-    // Init bounding box of the current word bounding box and from box file.
-    TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
-    TBOX word_box(page_res_it.word()->word->bounding_box());
-    bool one_word = true;
-    // Check whether the bounding box of the next word overlaps with the
-    // current box from box file.
-    while (page_res_it.next_word() != NULL &&
-           box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
-      word_box = word_box.bounding_union(
-          page_res_it.next_word()->word->bounding_box());
-      page_res_it.forward();
-      one_word = false;
-    }
-    if (!word_box.major_overlap(box)) {
-      if (!word_box.x_overlap(box)) {
-        // We must be looking at the word that belongs in the "next" bounding
-        // box from the box file. The ngram that was supposed to appear in
-        // the current box read from the box file must have been dropped by
-        // tesseract as noise.
-        tprintf("Word %s was dropped as noise.\n", label);
-        continue;  // stay on this blob, but read next box from box file
+ResultIterator::ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+                               int scale, int scaled_yres,
+                               int rect_left, int rect_top,
+                               int rect_width, int rect_height)
+  : PageIterator(page_res, tesseract, scale, scaled_yres,
+    rect_left, rect_top, rect_width, rect_height) {
+}
+
+ResultIterator::~ResultIterator() {
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  STRING text;
+  PAGE_RES_IT res_it(*it_);
+  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  ASSERT_HOST(best_choice != NULL);
+  switch (level) {
+    case RIL_BLOCK:
+    case RIL_PARA:
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        text += best_choice->unichar_string();
+        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
+        res_it.forward();
+      } while (res_it.block() == res_it.prev_block());
+      break;
+    case RIL_TEXTLINE:
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        text += best_choice->unichar_string();
+        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
+         res_it.forward();
+      } while (res_it.row() == res_it.prev_row());
+      break;
+    case RIL_WORD:
+      text = best_choice->unichar_string();
+      break;
+    case RIL_SYMBOL:
+      text = tesseract_->unicharset.id_to_unichar(
+          best_choice->unichar_id(blob_index_));
+  }
+  int length = text.length() + 1;
+  char* result = new char[length];
+  strncpy(result, text.string(), length);
+  return result;
+}
+
+// Returns the mean confidence of the current object at the given level.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+float ResultIterator::Confidence(PageIteratorLevel level) const {
+  if (it_->word() == NULL) return 0.0f;  // Already at the end!
+  float mean_certainty = 0.0f;
+  int certainty_count = 0;
+  PAGE_RES_IT res_it(*it_);
+  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  ASSERT_HOST(best_choice != NULL);
+  switch (level) {
+    case RIL_BLOCK:
+    case RIL_PARA:
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        mean_certainty += best_choice->certainty();
+        ++certainty_count;
+        res_it.forward();
+      } while (res_it.block() == res_it.prev_block());
+      break;
+    case RIL_TEXTLINE:
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        mean_certainty += best_choice->certainty();
+        ++certainty_count;
+        res_it.forward();
+      } while (res_it.row() == res_it.prev_row());
+      break;
+    case RIL_WORD:
+      mean_certainty += best_choice->certainty();
+     ++certainty_count;
+      break;
+    case RIL_SYMBOL:
+      BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+      if (choices != NULL) {
+        BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+        for (int blob = 0; blob < blob_index_; ++blob)
+          blob_choices_it.forward();
+        BLOB_CHOICE_IT choice_it(blob_choices_it.data());
+        for (choice_it.mark_cycle_pt();
+             !choice_it.cycled_list();
+             choice_it.forward()) {
+          if (choice_it.data()->unichar_id() ==
+              best_choice->unichar_id(blob_index_))
+            break;
+        }
+        mean_certainty += choice_it.data()->certainty();
       } else {
-        tprintf("Error: Insufficient overlap for word box"
-                " and box from file for %s\n", label);
-        word_box.print();
-        box.print();
-        exit(1);
+        mean_certainty += best_choice->certainty();
       }
-    }
-    // Skip recognizing the ngram if tesseract is sure it's not
-    // one word, otherwise run one recognition pass on this word.
-    if (!one_word) {
-      tprintf("Tesseract segmented %s as multiple words\n", label);
-    } else {
-      ambigs_classify_and_output(&page_res_it, label, output_file);
-    }
-    page_res_it.forward();
-  }
-  fclose(box_file);
-}
-
-// Run classify_word_pass1() on the current word. Output tesseract's raw choice
-// as a result of the classification. For words labeled with a single unichar
-// also output all alternatives from blob_choices of the best choice.
-void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
-                                           const char *label,
-                                           FILE *output_file) {
-  int offset;
-  // Classify word.
-  classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
-                      page_res_it->block()->block,
-                      FALSE, NULL, NULL);
-  WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
-  ASSERT_HOST(best_choice != NULL);
-  ASSERT_HOST(best_choice->blob_choices() != NULL);
-
-  // Compute the number of unichars in the label.
-  int label_num_unichars = 0;
-  int step = 1;  // should be non-zero on the first iteration
-  for (offset = 0; label[offset] != '\0' && step > 0;
-       step = getDict().getUnicharset().step(label + offset),
-       offset += step, ++label_num_unichars);
-  if (step == 0) {
-    tprintf("Not outputting illegal unichar %s\n", label);
-    return;
-  }
-
-  // Output all classifier choices for the unigrams (1-1 classifications).
-  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
-    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
-    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
-    BLOB_CHOICE_IT blob_choice_it;
-    blob_choice_it.set_to_list(outer_blob_choice_it.data());
-    for (blob_choice_it.mark_cycle_pt();
-         !blob_choice_it.cycled_list();
-         blob_choice_it.forward()) {
-      BLOB_CHOICE *blob_choice = blob_choice_it.data();
-      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
-        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
-               unicharset.id_to_unichar(blob_choice->unichar_id()),
-               label, blob_choice->rating(), blob_choice->certainty());
-      }
-    }
-  }
-  // Output the raw choice for succesful non 1-1 classifications.
-  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
-}
-
-}  // namespace tesseract
+      ++certainty_count;
+  }
+  if (certainty_count > 0) {
+    mean_certainty /= certainty_count;
+    float confidence = 100 + 5 * mean_certainty;
+    if (confidence < 0.0f) confidence = 0.0f;
+    if (confidence > 100.0f) confidence = 100.0f;
+    return confidence;
+  }
+  return 0.0f;
+}
+
+// Returns the font attributes of the current word. If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+const char* ResultIterator::WordFontAttributes(bool* is_bold,
+                                               bool* is_italic,
+                                               bool* is_underlined,
+                                               bool* is_monospace,
+                                               bool* is_serif,
+                                               int* pointsize,
+                                               int* font_id) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  *font_id = it_->word()->font1;
+  if (*font_id < 0) return NULL;  // No font available.
+  const UnicityTable<FontInfo> &font_table = tesseract_->get_fontinfo_table();
+  FontInfo font_info = font_table.get(*font_id);
+  *is_bold = font_info.is_bold();
+  *is_italic = font_info.is_italic();
+  *is_underlined = false;  // TODO(rays) fix this!
+  *is_monospace = font_info.is_fixed_pitch();
+  *is_serif = font_info.is_serif();
+  // The font size is calculated from a multiple of the x-height
+  // that came from the block.
+  float row_height = it_->row()->row->x_height() *
+      it_->block()->block->cell_over_xheight();
+  // Convert from pixels to printers points.
+  *pointsize = scaled_yres_ > 0
+    ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
+    : 0;
+
+  return font_info.name;
+}
+
+// Returns true if the current word was found in a dictionary.
+bool ResultIterator::WordIsFromDictionary() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+         permuter == USER_DAWG_PERM;
+}
+
+// Returns true if the current word is numeric.
+bool ResultIterator::WordIsNumeric() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  int permuter = it_->word()->best_choice->permuter();
+  return permuter == NUMBER_PERM;
+}
+
+ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) {
+  ASSERT_HOST(result_it.it_->word() != NULL);
+  tesseract_ = result_it.tesseract_;
+  PAGE_RES_IT res_it(*result_it.it_);
+  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+  if (choices != NULL) {
+    BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+    for (int blob = 0; blob < result_it.blob_index_; ++blob)
+      blob_choices_it.forward();
+    choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
+    choice_it_->mark_cycle_pt();
+  } else {
+    choice_it_ = NULL;
+  }
+}
+
+ChoiceIterator::~ChoiceIterator() {
+  delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
+bool ChoiceIterator::Next() {
+  if (choice_it_ == NULL)
+    return false;
+  choice_it_->forward();
+  return !choice_it_->cycled_list();
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. Points to an internal structure: do NOT delete[] it after use.
+const char* ChoiceIterator::GetUTF8Text() const {
+  if (choice_it_ == NULL)
+    return NULL;
+  UNICHAR_ID id = choice_it_->data()->unichar_id();
+  if (id < 0 || id >= tesseract_->unicharset.size() ||
+      id == INVALID_UNICHAR_ID)
+    return NULL;
+  return tesseract_->unicharset.id_to_unichar(id);
+}
+
+// Returns the confidence of the current choice.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+float ChoiceIterator::Confidence() const {
+  if (choice_it_ == NULL)
+    return 0.0f;
+  float confidence = 100 + 5 * choice_it_->data()->certainty();
+  if (confidence < 0.0f) confidence = 0.0f;
+  if (confidence > 100.0f) confidence = 100.0f;
+  return confidence;
+}
+
+
+}  // namespace tesseract.
ccmain/applybox.h to api/pageiterator.h
--- a/ccmain/applybox.h
+++ b/api/pageiterator.h
@@ -1,84 +1,184 @@
-/**********************************************************************
- * File:        applybox.h  (Formerly applybox.h)
- * Description: Re segment rows according to box file data
- * Author:		Phil Cheatle
- * Created:		Wed Nov 24 09:11:23 GMT 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
+///////////////////////////////////////////////////////////////////////
+// File:        pageiterator.h
+// Description: Iterator for tesseract page structure that avoids using
+//              tesseract internal data structures.
+// Author:      Ray Smith
+// Created:     Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
 
-#ifndef           APPLYBOX_H
-#define           APPLYBOX_H
+#ifndef TESSERACT_API_PAGEITERATOR_H__
+#define TESSERACT_API_PAGEITERATOR_H__
 
-#include          "varable.h"
-#include          "ocrblock.h"
-#include          "ocrrow.h"
-#include          "notdll.h"
-#include          "unichar.h"
+#include "apitypes.h"
 
-extern BOOL_VAR_H (applybox_rebalance, TRUE, "Drop dead");
-extern INT_VAR_H (applybox_debug, 0, "Debug level");
-extern INT_VAR_H (applybox_page, 0, "Page number to apply boxes from");
-extern STRING_VAR_H (applybox_test_exclusions, "|",
-                     "Chars ignored for testing");
-extern double_VAR_H (applybox_error_band, 0.15, "Err band as fract of xht");
-extern STRING_VAR_H(exposure_pattern, "exp",
-                    "Exposure value follows this pattern in the image"
-                    " filename. The name of the image files are expected"
-                    " to be in the form [lang].[fontname].exp[num].tif");
+class C_BLOB_IT;  // Held by pointer only (cblob_it_ below).
+class PBLOB_IT;  // NOTE(review): not referenced elsewhere in this header.
+class PAGE_RES;
+class PAGE_RES_IT;
+class WERD;
+struct Pix;  // Image type returned by GetBinaryImage/GetImage below.
 
-static const int kMinFragmentOutlineArea = 10;
+namespace tesseract {
 
-void apply_boxes(const STRING& filename,
-                 BLOCK_LIST *block_list    //real blocks
-                );
+class Tesseract;
 
-ROW *find_row_of_box(
-                     BLOCK_LIST *block_list,  //real blocks
-                     const TBOX &box,               //from boxfile
-                     inT16 &block_id,
-                     inT16 &row_id_to_process);
+// Class to iterate over tesseract page structure, providing access to all
+// levels of the page hierarchy, without including any tesseract headers or
+// having to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also ResultIterator, derived from PageIterator, which adds in the
+// ability to access OCR output with text-specific methods.
 
-inT16 resegment_box(
-                    ROW *row,
-                    TBOX &box,
-                    UNICHAR_ID uch_id,
-                    inT16 block_id,
-                    inT16 row_id,
-                    inT16 boxfile_lineno,
-                    inT16 boxfile_charno,
-                    inT16 *tgt_char_counts,
-                    bool learn_char_fragments,
-                    bool learning);
+class PageIterator {
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+  // The scale and scaled_yres are in case the Thresholder scaled the image
+  // rectangle prior to thresholding. Any coordinates in tesseract's image
+  // must be divided by scale before adding (rect_left, rect_top).
+  // The scaled_yres indicates the effective resolution of the binary image
+  // that tesseract has been given by the Thresholder.
+  // After the constructor, Begin has already been called.
+  PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
+               int scale, int scaled_yres,
+               int rect_left, int rect_top,
+               int rect_width, int rect_height);
+  virtual ~PageIterator();
 
-void tidy_up(
-             BLOCK_LIST *block_list,  //real blocks
-             inT16 &ok_char_count,
-             inT16 &ok_row_count,
-             inT16 &unlabelled_words,
-             inT16 *tgt_char_counts,
-             inT16 &rebalance_count,
-             UNICHAR_ID *min_uch_id,
-             inT16 &min_samples,
-             inT16 &final_labelled_blob_count,
-             bool learn_character_fragments,
-             bool learning);
+  // Page/ResultIterators may be copied! This makes it possible to iterate over
+  // all the objects at a lower level, while maintaining an iterator to
+  // objects at a higher level. These constructors DO NOT CALL Begin, so
+  // iterations will continue from the location of src.
+  PageIterator(const PageIterator& src);
+  const PageIterator& operator=(const PageIterator& src);
 
-void report_failed_box(inT16 boxfile_lineno,
-                       inT16 boxfile_charno,
-                       TBOX box,
-                       const char *box_ch,
-                       const char *err_msg);
+  // ============= Moving around within the page ============.
 
-void apply_box_training(const STRING& filename, BLOCK_LIST *block_list);
-#endif
+  // Moves the iterator to point to the start of the page to begin an iteration.
+  void Begin();
+
+  // Moves to the start of the next object at the given level in the
+  // page hierarchy, and returns false if the end of the page was reached.
+  // NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+  // PageIteratorLevel level values will visit each non-text block once.
+  // Think of non text blocks as containing a single para, with a single line,
+  // with a single imaginary word.
+  // Calls to Next with different levels may be freely intermixed.
+  // This function iterates words in right-to-left scripts correctly, if
+  // the appropriate language has been loaded into Tesseract.
+  bool Next(PageIteratorLevel level);
+
+  // Returns true if the iterator is at the start of an object at the given
+  // level. Possible uses include determining if a call to Next(RIL_WORD)
+  // moved to the start of a RIL_PARA.
+  bool IsAtBeginningOf(PageIteratorLevel level) const;
+
+  // Returns whether the iterator is positioned at the last element in a
+  // given level. (e.g. the last word in a line, the last line in a block)
+  bool IsAtFinalElement(PageIteratorLevel level,
+                        PageIteratorLevel element) const;
+
+  // ============= Accessing data ==============.
+  // Coordinate system:
+  // Integer coordinates are at the cracks between the pixels.
+  // The top-left corner of the top-left pixel in the image is at (0,0).
+  // The bottom-right corner of the bottom-right pixel in the image is at
+  // (width, height).
+  // Every bounding box goes from the top-left of the top-left contained
+  // pixel to the bottom-right of the bottom-right contained pixel, so
+  // the bounding box of the single top-left pixel in the image is:
+  // (0,0)->(1,1).
+  // If an image rectangle has been set in the API, then returned coordinates
+  // relate to the original (full) image, rather than the rectangle.
+
+  // Returns the bounding rectangle of the current object at the given level.
+  // See comment on coordinate system above.
+  // Returns false if there is no such object at the current position.
+  // The returned bounding box is guaranteed to match the size and position
+  // of the image returned by GetBinaryImage, but may clip foreground pixels
+  // from a grey image. The padding argument to GetImage can be used to expand
+  // the image to include more foreground pixels. See GetImage below.
+  bool BoundingBox(PageIteratorLevel level,
+                   int* left, int* top, int* right, int* bottom) const;
+
+  // Returns the type of the current block. See apitypes.h for PolyBlockType.
+  PolyBlockType BlockType() const;
+
+  // Returns a binary image of the current object at the given level.
+  // The position and size match the return from BoundingBox.
+  // Use pixDestroy to delete the image after use.
+  Pix* GetBinaryImage(PageIteratorLevel level) const;
+
+  // Returns an image of the current object at the given level in greyscale
+  // if available in the input. To guarantee a binary image use GetBinaryImage.
+  // NOTE that in order to give the best possible image, the bounds are
+  // expanded slightly over the binary connected component, by the supplied
+  // padding, so the top-left position of the returned image is returned
+  // in (left,top). These will most likely not match the coordinates
+  // returned by BoundingBox.
+  // Use pixDestroy to delete the image after use.
+  Pix* GetImage(PageIteratorLevel level, int padding,
+                int* left, int* top) const;
+
+  // Returns the baseline of the current object at the given level.
+  // The baseline is the line that passes through (x1, y1) and (x2, y2).
+  // WARNING: with vertical text, baselines may be vertical!
+  // Returns false if there is no baseline at the current position.
+  bool Baseline(PageIteratorLevel level,
+                int* x1, int* y1, int* x2, int* y2) const;
+
+ protected:
+  // Sets up the internal data for iterating the blobs of a new word, then
+  // moves the iterator to the given offset.
+  void BeginWord(int offset);
+
+  // Pointer to the page_res owned by the API.
+  PAGE_RES* page_res_;
+  // Pointer to the Tesseract object owned by the API.
+  Tesseract* tesseract_;
+  // The iterator to the page_res_. Owned by this PageIterator.
+  // A pointer just to avoid dragging in Tesseract includes.
+  PAGE_RES_IT* it_;
+  // The current input WERD being iterated. If there is an output from OCR,
+  // then word_ is NULL. Owned by the API.
+  WERD* word_;
+  // The length of the current word_.
+  int word_length_;
+  // The current blob index within the word.
+  int blob_index_;
+  // Iterator to the blobs within the word. If NULL, then we are iterating
+  // OCR results in the box_word.
+  // Owned by this PageIterator.
+  C_BLOB_IT* cblob_it_;
+  // Parameters saved from the Thresholder. Needed to rebuild coordinates.
+  int scale_;
+  int scaled_yres_;
+  int rect_left_;
+  int rect_top_;
+  int rect_width_;
+  int rect_height_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_API_PAGEITERATOR_H__
1 2 3 .. 26 > >> (Page 1 of 26)