Commit [67bcbb] Maximize Restore History

Remaining misc changes for 3.02

git-svn-id: http://tesseract-ocr.googlecode.com/svn/trunk@658 d0cd1f9f-072b-0410-8dd7-cf729c803f20

theraysmith@gmail.com theraysmith@gmail.com 2012-02-02

1 2 > >> (Page 1 of 2)
added ccutil/bitvector.cpp
added ccutil/bitvector.h
added ccutil/indexmapbidi.cpp
added ccutil/unicodes.h
changed ChangeLog
changed ReleaseNotes
changed ccutil
changed ccutil/Makefile.am
changed ccutil/ambigs.cpp
changed ccutil/ccutil.cpp
changed ccutil/clst.cpp
changed ccutil/clst.h
changed ccutil/errcode.cpp
changed ccutil/errcode.h
changed ccutil/genericvector.h
changed ccutil/helpers.h
changed ccutil/host.h
changed ccutil/ocrclass.h
changed ccutil/params.cpp
changed ccutil/params.h
changed ccutil/strngs.cpp
changed ccutil/strngs.h
changed ccutil/tesscallback.h
changed ccutil/tessdatamanager.cpp
changed ccutil/tessdatamanager.h
changed ccutil/tprintf.cpp
changed ccutil/unichar.h
changed ccutil/unicharset.cpp
changed ccutil/unicharset.h
changed ccutil/unicity_table.h
changed configure.ac
changed cutil
changed cutil/freelist.cpp
changed cutil/freelist.h
changed image
changed image/image.h
changed image/svshowim.cpp
changed viewer
changed viewer/scrollview.cpp
changed viewer/scrollview.h
changed viewer/svutil.cpp
copied ccutil/boxread.cpp -> ccutil/indexmapbidi.h
copied ccutil/boxread.h -> ccutil/unicodes.cpp
ccutil/bitvector.cpp Diff Switch to side-by-side view
Loading...
ccutil/bitvector.h Diff Switch to side-by-side view
Loading...
ccutil/indexmapbidi.cpp Diff Switch to side-by-side view
Loading...
ccutil/unicodes.h Diff Switch to side-by-side view
Loading...
ChangeLog Diff Switch to side-by-side view
Loading...
ReleaseNotes Diff Switch to side-by-side view
Loading...
ccutil
Directory.
ccutil/Makefile.am Diff Switch to side-by-side view
Loading...
ccutil/ambigs.cpp Diff Switch to side-by-side view
Loading...
ccutil/ccutil.cpp Diff Switch to side-by-side view
Loading...
ccutil/clst.cpp Diff Switch to side-by-side view
Loading...
ccutil/clst.h Diff Switch to side-by-side view
Loading...
ccutil/errcode.cpp Diff Switch to side-by-side view
Loading...
ccutil/errcode.h Diff Switch to side-by-side view
Loading...
ccutil/genericvector.h Diff Switch to side-by-side view
Loading...
ccutil/helpers.h Diff Switch to side-by-side view
Loading...
ccutil/host.h Diff Switch to side-by-side view
Loading...
ccutil/ocrclass.h Diff Switch to side-by-side view
Loading...
ccutil/params.cpp Diff Switch to side-by-side view
Loading...
ccutil/params.h Diff Switch to side-by-side view
Loading...
ccutil/strngs.cpp Diff Switch to side-by-side view
Loading...
ccutil/strngs.h Diff Switch to side-by-side view
Loading...
ccutil/tesscallback.h Diff Switch to side-by-side view
Loading...
ccutil/tessdatamanager.cpp Diff Switch to side-by-side view
Loading...
ccutil/tessdatamanager.h Diff Switch to side-by-side view
Loading...
ccutil/tprintf.cpp Diff Switch to side-by-side view
Loading...
ccutil/unichar.h Diff Switch to side-by-side view
Loading...
ccutil/unicharset.cpp Diff Switch to side-by-side view
Loading...
ccutil/unicharset.h Diff Switch to side-by-side view
Loading...
ccutil/unicity_table.h Diff Switch to side-by-side view
Loading...
configure.ac Diff Switch to side-by-side view
Loading...
cutil
Directory.
cutil/freelist.cpp Diff Switch to side-by-side view
Loading...
cutil/freelist.h Diff Switch to side-by-side view
Loading...
image
Directory.
image/image.h Diff Switch to side-by-side view
Loading...
image/svshowim.cpp Diff Switch to side-by-side view
Loading...
viewer
Directory.
viewer/scrollview.cpp Diff Switch to side-by-side view
Loading...
viewer/scrollview.h Diff Switch to side-by-side view
Loading...
viewer/svutil.cpp Diff Switch to side-by-side view
Loading...
ccutil/boxread.cpp to ccutil/indexmapbidi.h
--- a/ccutil/boxread.cpp
+++ b/ccutil/indexmapbidi.h
@@ -1,138 +1,180 @@
-/**********************************************************************
- * File:        boxread.cpp
- * Description: Read data from a box file.
- * Author:      Ray Smith
- * Created:     Fri Aug 24 17:47:23 PDT 2007
- *
- * (C) Copyright 2007, Google Inc.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
+///////////////////////////////////////////////////////////////////////
+// File:        indexmapbidi.h
+// Description: Bi-directional mapping between a sparse and compact space.
+// Author:      rays@google.com (Ray Smith)
+// Created:     Tue Apr 06 11:33:59 PDT 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
 
-#include "mfcpch.h"
-#include <string.h>
-#include "boxread.h"
-#include "unichar.h"
-#include "tprintf.h"
+#ifndef TESSERACT_CCUTIL_INDEXMAPBIDI_H_
+#define TESSERACT_CCUTIL_INDEXMAPBIDI_H_
 
-// Special char code used to identify multi-blob labels.
-static const char* kMultiBlobLabelCode = "WordStr";
+#include <stdio.h>
+#include "genericvector.h"
 
-// Open the boxfile based on the given image filename.
-FILE* OpenBoxFile(const STRING& fname) {
-  STRING filename = fname;
-  const char *lastdot = strrchr(filename.string(), '.');
-  if (lastdot != NULL)
-    filename[lastdot - filename.string()] = '\0';
+namespace tesseract {
 
-  filename += ".box";
-  FILE* box_file = NULL;
-  if (!(box_file = fopen(filename.string(), "rb"))) {
-    CANTOPENFILE.error("read_next_box", TESSEXIT,
-                       "Cant open box file %s",
-                       filename.string());
+class IndexMapBiDi;
+
+// Bidirectional one-to-one mapping between a sparse and a compact discrete
+// space. Many entries in the sparse space are unmapped, but those that are
+// mapped have a 1-1 mapping to (and from) the compact space, where all
+// values are used. This is useful for forming subsets of larger collections,
+// such as subsets of character sets, or subsets of binary feature spaces.
+//
+// This base class provides basic functionality with binary search for the
+// SparseToCompact mapping to save memory.
+// For a faster inverse mapping, or to allow a many-to-one mapping, use
+// IndexMapBiDi below.
+// NOTE: there are currently no methods to set up an IndexMap on its own!
+// It must be initialized by copying from an IndexMapBiDi or by DeSerialize.
+class IndexMap {
+ public:
+  virtual ~IndexMap() {}
+
+  // SparseToCompact takes a sparse index to an index in the compact space.
+  // Uses a binary search to find the result. For faster speed use
+  // IndexMapBiDi, but that takes more memory.
+  virtual int SparseToCompact(int sparse_index) const;
+
+  // CompactToSparse takes a compact index to the corresponding index in the
+  // sparse space.
+  int CompactToSparse(int compact_index) const {
+    return compact_map_[compact_index];
   }
-  return box_file;
-}
+  // The size of the sparse space.
+  virtual int SparseSize() const {
+    return sparse_size_;
+  }
+  // The size of the compact space.
+  int CompactSize() const {
+    return compact_map_.size();
+  }
 
-// Box files are used ONLY DURING TRAINING, but by both processes of
-// creating tr files with tesseract, and unicharset_extractor.
-// read_next_box factors out the code to interpret a line of a box
-// file so that applybox and unicharset_extractor interpret the same way.
-// This function returns the next valid box file utf8 string and coords
-// and returns true, or false on eof (and closes the file).
-// It ignores the uft8 file signature, checks for valid utf-8 and allows
-// space or tab between fields.
-// utf8_str must be at least kBoxReadBufSize in length.
-// If there are page numbers in the file, it reads them all.
-bool read_next_box(int *line_number, FILE* box_file, char* utf8_str,
-                   int* x_min, int* y_min, int* x_max, int* y_max) {
-  return read_next_box(-1, line_number, box_file, utf8_str,
-                       x_min, y_min, x_max, y_max);
-}
+  // Copy from the input.
+  void CopyFrom(const IndexMap& src);
+  void CopyFrom(const IndexMapBiDi& src);
 
-// As read_next_box above, but get a specific page number. (0-based)
-// Use -1 to read any page number. Files without page number all
-// read as if they are page 0.
-bool read_next_box(int target_page, int *line_number,
-                   FILE* box_file, char* utf8_str,
-                   int* x_min, int* y_min, int* x_max, int* y_max) {
-  int count = 0;
-  int page = 0;
-  char buff[kBoxReadBufSize];   // boxfile read buffer
-  char uch[kBoxReadBufSize];
-  char *buffptr = buff;
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE* fp) const;
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  bool DeSerialize(bool swap, FILE* fp);
 
-  while (fgets(buff, sizeof(buff) - 1, box_file)) {
-    (*line_number)++;
+ protected:
+  // The sparse space covers integers in the range [0, sparse_size_-1].
+  int sparse_size_;
+  // The compact space covers integers in the range [0, compact_map_.size()-1].
+  // Each element contains the corresponding sparse index.
+  GenericVector<inT32> compact_map_;
+};
 
-    buffptr = buff;
-    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
-    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
-      buffptr += 3;  // Skip unicode file designation.
-    // Check for blank lines in box file
-    while (*buffptr == ' ' || *buffptr == '\t')
-      buffptr++;
-    if (*buffptr != '\0') {
-      // Read the unichar without messing up on Tibetan.
-      // According to issue 253 the utf-8 surrogates 85 and A0 are treated
-      // as whitespace by sscanf, so it is more reliable to just find
-      // ascii space and tab.
-      int uch_len = 0;
-      while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t')
-        uch[uch_len++] = *buffptr++;
-      uch[uch_len] = '\0';
-      if (*buffptr != '\0') ++buffptr;
-      count = sscanf(buffptr, "%d %d %d %d %d",
-                     x_min, y_min, x_max, y_max, &page);
-      if (count != 5) {
-        if (target_page <= 0) {
-          // If target_page is negative or zero, allow lines with no page number
-          page = 0;
-          count = sscanf(buffptr, "%d %d %d %d", x_min, y_min, x_max, y_max);
-        } else {
-          tprintf("Box file format error on line %i; ignored\n", *line_number);
-          continue;
-        }
-      }
-      if (target_page >= 0 && target_page != page)
-        continue;  // Not on the appropriate page.
-      // Test for long space-delimited string label.
-      if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
-          (buffptr = strchr(buffptr, '#')) != NULL) {
-        strcpy(uch, buffptr + 1);
-        chomp_string(uch);
-        uch_len = strlen(uch);
-      }
-      // Validate UTF8 by making unichars with it.
-      int used = 0;
-      while (used < uch_len) {
-        UNICHAR ch(uch + used, uch_len - used);
-        int new_used = ch.utf8_len();
-        if (new_used == 0) {
-          tprintf("Bad UTF-8 str %s starts with 0x%02x at line %d, col %d\n",
-                  uch + used, uch[used], *line_number, used + 1);
-          count = 0;
-          break;
-        }
-        used += new_used;
-      }
-      if (count < 4 || used == 0) {
-        tprintf("Box file format error on line %i; ignored\n", *line_number);
-      } else {
-        strncpy(utf8_str, uch, kBoxReadBufSize);
-        return true;  // Successfully read a box.
-      }
-    }
+// Bidirectional many-to-one mapping between a sparse and a compact discrete
+// space. As with IndexMap, many entries may be unmapped, but unlike IndexMap,
+// of those that are, many may be mapped to the same compact index.
+// If the map is many-to-one, it is not possible to directly obtain all the
+// sparse indices that map to a single compact index.
+// This map is time- rather than space-efficient. It stores the entire sparse
+// space.
+// IndexMapBiDi may be initialized in one of 3 ways:
+// 1. Init(size, true);
+//    Setup();
+//    Sets a complete 1:1 mapping with no unmapped elements.
+// 2. Init(size, false);
+//    for ... SetMap(index, true);
+//    Setup();
+//    Specifies precisely which sparse indices are mapped. The mapping is 1:1.
+// 3. Either of the above, followed by:
+//    for ... Merge(index1, index2);
+//    CompleteMerges();
+//    Allows a many-to-one mapping by merging compact space indices.
+class IndexMapBiDi : public IndexMap {
+ public:
+  virtual ~IndexMapBiDi() {}
+
+  // Top-level init function in a single call to initialize a map to select
+  // a single contiguous subrange [start, end) of the sparse space to be mapped
+  // 1 to 1 to the compact space, with all other elements of the sparse space
+  // left unmapped.
+  // No need to call Setup after this.
+  void InitAndSetupRange(int sparse_size, int start, int end);
+
+  // Initializes just the sparse_map_ to the given size with either all
+  // forward indices mapped (all_mapped = true) or none (all_mapped = false).
+  // Call Setup immediately after, or make calls to SetMap first to adjust the
+  // mapping and then call Setup before using the map.
+  void Init(int size, bool all_mapped);
+  // Sets a given index in the sparse_map_ to be mapped or not.
+  void SetMap(int sparse_index, bool mapped);
+  // Sets up the sparse_map_ and compact_map_ properly after Init and
+  // some calls to SetMap. Assumes an ordered 1-1 map from set indices
+  // in the sparse space to the compact space.
+  void Setup();
+
+  // Merges the two compact space indices. May be called many times, but
+  // the merges must be concluded by a call to CompleteMerges.
+  // Returns true if a merge was actually performed.
+  bool Merge(int compact_index1, int compact_index2);
+  // Returns true if the given compact index has been deleted.
+  bool IsCompactDeleted(int index) const {
+    return MasterCompactIndex(index) < 0;
   }
-  fclose(box_file);
-  return false;  // EOF
-}
+  // Completes one or more Merge operations by further compacting the
+  // compact space.
+  void CompleteMerges();
+
+  // SparseToCompact takes a sparse index to an index in the compact space.
+  virtual int SparseToCompact(int sparse_index) const {
+    return sparse_map_[sparse_index];
+  }
+  // The size of the sparse space.
+  virtual int SparseSize() const {
+    return sparse_map_.size();
+  }
+
+  // Copy from the input.
+  void CopyFrom(const IndexMapBiDi& src);
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE* fp) const;
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  bool DeSerialize(bool swap, FILE* fp);
+
+  // Bulk calls to SparseToCompact.
+  // Maps the given array of sparse indices to an array of compact indices.
+  // Assumes the input is sorted. The output indices are sorted and uniqued.
+  // Return value is the number of "missed" features, being features that
+  // don't map to the compact feature space.
+  int MapFeatures(const GenericVector<int>& sparse,
+                  GenericVector<int>* compact) const;
+
+ private:
+  // Returns the master compact index for a given compact index.
+  // During a multiple merge operation, several compact indices may be
+  // combined, so we need to be able to find the master of all.
+  int MasterCompactIndex(int compact_index) const {
+    while (compact_index >= 0 &&
+           sparse_map_[compact_map_[compact_index]] != compact_index)
+      compact_index = sparse_map_[compact_map_[compact_index]];
+    return compact_index;
+  }
+
+  // Direct look-up of the compact index for each element in sparse space.
+  GenericVector<inT32> sparse_map_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCUTIL_INDEXMAPBIDI_H_
ccutil/boxread.h to ccutil/unicodes.cpp
--- a/ccutil/boxread.h
+++ b/ccutil/unicodes.cpp
@@ -1,10 +1,10 @@
 /**********************************************************************
- * File:        boxread.cpp
- * Description: Read data from a box file.
- * Author:		Ray Smith
- * Created:		Fri Aug 24 17:47:23 PDT 2007
+ * File:        unicodes.cpp
+ * Description: Unicode related machinery
+ * Author:      David Eger
+ * Created:     Wed Jun 15 16:37:50 PST 2011
  *
- * (C) Copyright 2007, Google Inc.
+ * (C) Copyright 2011, Google, Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
@@ -17,32 +17,41 @@
  *
  **********************************************************************/
 
-#ifndef TESSERACT_CCUTIL_BOXREAD_H__
-#define TESSERACT_CCUTIL_BOXREAD_H__
+#include "unicodes.h"
+#include "host.h"  // for NULL
 
-#include <stdio.h>
-#include "strngs.h"
+namespace tesseract {
 
-// Size of buffer used to read a line from a box file.
-const int kBoxReadBufSize = 1024;
+const char *kUTF8LineSeparator = "\u2028";  // "\xe2\x80\xa8";
+const char *kUTF8ParagraphSeparator = "\u2029";  // "\xe2\x80\xa9";
+const char *kLRM = "\u200E";  // Left-to-Right Mark
+const char *kRLM = "\u200F";  // Right-to-Left Mark
+const char *kRLE = "\u202A";  // Right-to-Left Embedding (NB: U+202A is LRE, not RLE)
+const char *kPDF = "\u202C";  // Pop Directional Formatting
 
-// Open the boxfile based on the given image filename.
-FILE* OpenBoxFile(const STRING& fname);
+const char *kHyphenLikeUTF8[] = {
+  "-",       // ASCII hyphen-minus
+  "\u05BE",  // word hyphen in Hebrew
+  "\u2010",  // hyphen
+  "\u2011",  // non-breaking hyphen
+  "\u2012",  // a hyphen the same width as digits
+  "\u2013",  // en dash
+  "\u2014",  // em dash
+  "\u2015",  // horizontal bar
+  "\u2212",  // arithmetic minus sign
+  "\uFE58",  // small em dash
+  "\uFE63",  // small hyphen-minus
+  "\uFF0D",  // fullwidth hyphen-minus
+  NULL,      // end of our list
+};
 
-// read_next_box factors out the code to interpret a line of a box
-// file so that applybox and unicharset_extractor interpret the same way.
-// This function returns the next valid box file utf8 string and coords
-// and returns true, or false on eof (and closes the file).
-// If ignores the uft8 file signature, checks for valid utf-8 and allows
-// space or tab between fields.
-// utf8_str must be at least kBoxReadBufSize in length.
-// If there are page numbers in the file, it reads them all.
-bool read_next_box(int *line_number, FILE* box_file, char* utf8_str,
-                   int* x_min, int* y_min, int* x_max, int* y_max);
-// As read_next_box above, but get a specific page number. (0-based)
-// Use -1 to read any page number. Files without page number all
-// read as if they are page 0.
-bool read_next_box(int page, int *line_number, FILE* box_file, char* utf8_str,
-                   int* x_min, int* y_min, int* x_max, int* y_max);
+const char *kApostropheLikeUTF8[] = {
+  "'",       // ASCII apostrophe
+  "`",       // ASCII backtick
+  "\u2018",  // opening single quote
+  "\u2019",  // closing single quote
+  "\u2032",  // mathematical prime mark
+  NULL,      // end of our list.
+};
 
-#endif  // TESSERACT_CCUTIL_BOXREAD_H__
+}  // namespace tesseract
1 2 > >> (Page 1 of 2)