Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo

Close

Commit [4dd453] Maximize Restore History

Merge compression updates

- New internal match-finding API (might release as stand-alone library
sometime)
- Add some new match-finding algorithms
- Get rid of lz_hash.c / lz_analyze_block()
- Add optimal parsing to XPRESS
- Optimize get_matches() / skip_bytes() calls in XPRESS and LZX
compressors
- Get rid of decompressor parameters
- Get rid of compressor parameters exposed in API (use compression levels
instead)

Eric Biggers Eric Biggers 2014-07-19

1 2 3 > >> (Page 1 of 3)
added examples/updatewim.c
added include/wimlib/divsufsort.h
added include/wimlib/lz_mf.h
added src/divsufsort.c
added src/lz_brute_force.c
added src/lz_hash_chains.c
added src/lz_lcp_interval_tree.c
added src/lz_linked_suffix_array.c
added src/lz_suffix_array_utils.c
changed Makefile.am
changed NEWS
changed README
changed doc
changed doc/man1
changed doc/man1/imagex-capture.1.in
changed doc/man1/imagex-export.1.in
changed doc/man1/imagex-extract.1.in
changed doc/man1/imagex-mount.1.in
changed doc/man1/imagex-optimize.1.in
changed examples
changed examples/Makefile
changed examples/compressfile.c
changed examples/decompressfile.c
changed include
changed include/wimlib
changed include/wimlib.h
changed include/wimlib/compressor_ops.h
changed include/wimlib/decompressor_ops.h
changed include/wimlib/lzx.h
changed programs
changed programs/imagex.c
changed src
changed src/compress.c
changed src/compress_parallel.c
changed src/compress_serial.c
changed src/decompress.c
changed src/lzms-compress.c
changed src/lzms-decompress.c
changed src/lzx-compress.c
changed src/lzx-decompress.c
changed src/resource.c
changed src/wim.c
changed src/xpress-compress.c
copied include/wimlib/lz.h -> include/wimlib/lz_mf_ops.h
copied include/wimlib/lz_bt.h -> src/lz_null.c
copied include/wimlib/lz_hash.h -> include/wimlib/lz_suffix_array_utils.h
copied src/lz_bt.c -> src/lz_binary_trees.c
copied src/lz_hash.c -> src/lz_mf.c
examples/updatewim.c Diff Switch to side-by-side view
Loading...
include/wimlib/divsufsort.h Diff Switch to side-by-side view
Loading...
include/wimlib/lz_mf.h Diff Switch to side-by-side view
Loading...
src/divsufsort.c Diff Switch to side-by-side view
Loading...
src/lz_brute_force.c Diff Switch to side-by-side view
Loading...
src/lz_hash_chains.c Diff Switch to side-by-side view
Loading...
src/lz_lcp_interval_tree.c Diff Switch to side-by-side view
Loading...
src/lz_linked_suffix_array.c Diff Switch to side-by-side view
Loading...
src/lz_suffix_array_utils.c Diff Switch to side-by-side view
Loading...
Makefile.am Diff Switch to side-by-side view
Loading...
NEWS Diff Switch to side-by-side view
Loading...
README Diff Switch to side-by-side view
Loading...
doc
Directory.
doc/man1
Directory.
doc/man1/imagex-capture.1.in Diff Switch to side-by-side view
Loading...
doc/man1/imagex-export.1.in Diff Switch to side-by-side view
Loading...
doc/man1/imagex-extract.1.in Diff Switch to side-by-side view
Loading...
doc/man1/imagex-mount.1.in Diff Switch to side-by-side view
Loading...
doc/man1/imagex-optimize.1.in Diff Switch to side-by-side view
Loading...
examples
Directory.
examples/Makefile Diff Switch to side-by-side view
Loading...
examples/compressfile.c Diff Switch to side-by-side view
Loading...
examples/decompressfile.c Diff Switch to side-by-side view
Loading...
include
Directory.
include/wimlib
Directory.
include/wimlib.h Diff Switch to side-by-side view
Loading...
include/wimlib/compressor_ops.h Diff Switch to side-by-side view
Loading...
include/wimlib/decompressor_ops.h Diff Switch to side-by-side view
Loading...
include/wimlib/lzx.h Diff Switch to side-by-side view
Loading...
programs
Directory.
programs/imagex.c Diff Switch to side-by-side view
Loading...
src
Directory.
src/compress.c Diff Switch to side-by-side view
Loading...
src/compress_parallel.c Diff Switch to side-by-side view
Loading...
src/compress_serial.c Diff Switch to side-by-side view
Loading...
src/decompress.c Diff Switch to side-by-side view
Loading...
src/lzms-compress.c Diff Switch to side-by-side view
Loading...
src/lzms-decompress.c Diff Switch to side-by-side view
Loading...
src/lzx-compress.c Diff Switch to side-by-side view
Loading...
src/lzx-decompress.c Diff Switch to side-by-side view
Loading...
src/resource.c Diff Switch to side-by-side view
Loading...
src/wim.c Diff Switch to side-by-side view
Loading...
src/xpress-compress.c Diff Switch to side-by-side view
Loading...
include/wimlib/lz.h to include/wimlib/lz_mf_ops.h
--- a/include/wimlib/lz.h
+++ b/include/wimlib/lz_mf_ops.h
@@ -1,27 +1,8 @@
-#ifndef _WIMLIB_LZ_H
-#define _WIMLIB_LZ_H
+#include "wimlib/lz_mf.h"
 
-#include "wimlib/compress_common.h"
-
-//#define ENABLE_LZ_DEBUG
-#ifdef ENABLE_LZ_DEBUG
-#  define LZ_ASSERT wimlib_assert
-#  include "wimlib/assert.h"
-#else
-#  define LZ_ASSERT(...)
-#endif
-
-
-/* Raw LZ match/literal format: just a length and offset.
- *
- * The length is the number of bytes of the match, and the offset is the number
- * of bytes back in the input the match is from the current position.
- *
- * This can alternatively be used to represent a literal byte if @len is less
- * than the minimum match length.  */
-struct lz_match {
-	u32 len;
-	u32 offset;
-};
-
-#endif /* _WIMLIB_LZ_H */
+extern const struct lz_mf_ops lz_null_ops;
+extern const struct lz_mf_ops lz_brute_force_ops;
+extern const struct lz_mf_ops lz_hash_chains_ops;
+extern const struct lz_mf_ops lz_binary_trees_ops;
+extern const struct lz_mf_ops lz_lcp_interval_tree_ops;
+extern const struct lz_mf_ops lz_linked_suffix_array_ops;
include/wimlib/lz_bt.h to src/lz_null.c
--- a/include/wimlib/lz_bt.h
+++ b/src/lz_null.c
@@ -1,83 +1,94 @@
 /*
- * lz_bt.h
+ * lz_null.c
  *
- * Binary tree match-finder for Lempel-Ziv compression.
+ * Dummy "match-finder" for Lempel-Ziv compression.
  *
- * Author:  Eric Biggers
- * Year:    2014
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
  *
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef _WIMLIB_LZ_BT_H
-#define _WIMLIB_LZ_BT_H
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
 
-#include "wimlib/types.h"
+#include "wimlib/lz_mf.h"
 
-/* Position type for the binary tree match-finder.
- * This can be changed to 'u16' if no window will exceed 65536 bytes.  */
-typedef u32 lz_bt_pos_t;
-
-/* Match length type for the binary tree match-finder.  */
-typedef unsigned lz_bt_len_t;
-
-/* The binary tree match-finder structure.  */
-struct lz_bt {
-	lz_bt_pos_t *hash_tab;
-	lz_bt_pos_t *digram_tab;
-	lz_bt_pos_t *child_tab;
-	const u8 *cur_window;
-	lz_bt_pos_t cur_window_pos;
-	lz_bt_pos_t cur_window_size;
-	lz_bt_pos_t max_window_size;
-	lz_bt_len_t min_match_len;
-	lz_bt_len_t max_match_len;
-	lz_bt_len_t num_fast_bytes;
-	u32 max_search_depth;
-};
-
-struct lz_match;
-
-extern u64
-lz_bt_get_needed_memory(lz_bt_pos_t max_window_size);
-
-extern bool
-lz_bt_init(struct lz_bt *mf,
-	   lz_bt_pos_t max_window_size,
-	   lz_bt_len_t min_match_len,
-	   lz_bt_len_t max_match_len,
-	   lz_bt_len_t num_fast_bytes,
-	   u32 max_search_depth);
-
-extern void
-lz_bt_load_window(struct lz_bt *mf, const u8 *window, lz_bt_pos_t window_size);
-
-extern lz_bt_len_t
-lz_bt_get_matches(struct lz_bt *mf, struct lz_match *matches);
-
-static inline lz_bt_pos_t
-lz_bt_get_position(const struct lz_bt *mf)
+static bool
+lz_null_params_valid(const struct lz_mf_params *_params)
 {
-	return mf->cur_window_pos;
+	return true;
 }
 
-static inline const u8 *
-lz_bt_get_window_ptr(const struct lz_bt *mf)
+static u64
+lz_null_get_needed_memory(u32 max_window_size)
 {
-	return &mf->cur_window[mf->cur_window_pos];
+	return 0;
 }
 
-static inline lz_bt_pos_t
-lz_bt_get_remaining_size(const struct lz_bt *mf)
+static bool
+lz_null_init(struct lz_mf *mf)
 {
-	return mf->cur_window_size - mf->cur_window_pos;
+	if (mf->params.min_match_len == 0)
+		mf->params.min_match_len = 2;
+
+	if (mf->params.max_match_len == 0)
+		mf->params.max_match_len = mf->params.max_window_size;
+
+	return true;
 }
 
-extern void
-lz_bt_skip_positions(struct lz_bt *mf, unsigned n);
+static void
+lz_null_load_window(struct lz_mf *mf, const u8 window[], u32 size)
+{
+}
 
-extern void
-lz_bt_destroy(struct lz_bt *mf);
+static u32
+lz_null_get_matches(struct lz_mf *mf, struct lz_match matches[])
+{
+	mf->cur_window_pos++;
+	return 0;
+}
 
-#endif /* _WIMLIB_LZ_BT_H */
+static void
+lz_null_skip_positions(struct lz_mf *mf, u32 n)
+{
+	mf->cur_window_pos += n;
+}
+
+static void
+lz_null_destroy(struct lz_mf *mf)
+{
+}
+
+const struct lz_mf_ops lz_null_ops = {
+	.params_valid      = lz_null_params_valid,
+	.get_needed_memory = lz_null_get_needed_memory,
+	.init		   = lz_null_init,
+	.load_window       = lz_null_load_window,
+	.get_matches       = lz_null_get_matches,
+	.skip_positions    = lz_null_skip_positions,
+	.destroy           = lz_null_destroy,
+	.struct_size	   = sizeof(struct lz_mf),
+};
include/wimlib/lz_hash.h to include/wimlib/lz_suffix_array_utils.h
--- a/include/wimlib/lz_hash.h
+++ b/include/wimlib/lz_suffix_array_utils.h
@@ -1,30 +1,17 @@
-#ifndef _WIMLIB_LZ_HASH_H
-#define _WIMLIB_LZ_HASH_H
+#ifndef _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H
+#define _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H
 
-#include "wimlib/compress_common.h"
+#include "wimlib/types.h"
 
-struct lz_params {
-	unsigned min_match;
-	unsigned max_match;
-	unsigned max_offset;
-	unsigned nice_match;
-	unsigned good_match;
-	unsigned max_chain_len;
-	unsigned max_lazy_match;
-	unsigned too_far;
-};
-
-typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
-typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
+#define BUILD_SA_MIN_TMP_LEN (65536 + 256)
 
 extern void
-lz_analyze_block(const u8 window[restrict],
-		 u32 window_size,
-		 lz_record_match_t record_match,
-		 lz_record_literal_t record_literal,
-		 void *record_ctx,
-		 const struct lz_params *params,
-		 u32 prev_tab[restrict]);
+build_SA(u32 *SA, const u8 *T, u32 n, u32 *tmp);
 
+extern void
+build_ISA(u32 *ISA, const u32 *SA, u32 n);
 
-#endif /* _WIMLIB_LZ_HASH_H  */
+extern void
+build_LCP(u32 *LCP, const u32 *SA, const u32 *ISA, const u8 *T, u32 n);
+
+#endif /* _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H */
src/lz_bt.c to src/lz_binary_trees.c
--- a/src/lz_bt.c
+++ b/src/lz_binary_trees.c
@@ -1,13 +1,32 @@
 /*
- * lz_bt.c
+ * lz_binary_trees.c
  *
  * Binary tree match-finder for Lempel-Ziv compression.
  *
- * Author:  Eric Biggers
- * Year:    2014
- *
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
@@ -19,16 +38,35 @@
 #  include "config.h"
 #endif
 
-#include "wimlib/lz.h"
-#include "wimlib/lz_bt.h"
+#include "wimlib/lz_mf.h"
 #include "wimlib/util.h"
+#include <pthread.h>
 #include <string.h>
-#include <pthread.h>
-
-#define LZ_BT_HASH_BITS		16
-#define LZ_BT_HASH_SIZE		(1 << LZ_BT_HASH_BITS)
-#define LZ_BT_HASH_MASK		(LZ_BT_HASH_SIZE - 1)
-#define LZ_BT_DIGRAM_TAB_SIZE	(256 * 256)
+
+/* Number of hash buckets.  This can be changed, but it should be a power of 2
+ * so that the correct hash bucket can be selected using a fast bitwise AND.  */
+#define LZ_BT_HASH_LEN		(1 << 16)
+
+/* Number of bytes from which the hash code is computed at each position.  This
+ * can be changed, provided that lz_bt_hash() is updated as well.  */
+#define LZ_BT_HASH_BYTES   3
+
+/* Number of entries in the digram table.
+ *
+ * Note:  You rarely get length-2 matches if you use length-3 hashing.  But
+ * since binary trees are typically used for higher compression ratios than hash
+ * chains, it is helpful for this match-finder to find length-2 matches as well.
+ * Therefore this match-finder also uses a digram table to find length-2 matches
+ * when the minimum match length is 2.  */
+#define LZ_BT_DIGRAM_TAB_LEN	(256 * 256)
+
+struct lz_bt {
+	struct lz_mf base;
+	u32 *hash_tab;
+	u32 *digram_tab;
+	u32 *child_tab;
+	u32 next_hash;
+};
 
 static u32 crc32_table[256];
 static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT;
@@ -48,14 +86,9 @@
         }
 }
 
-/*
- * Compute the hash code for the next 3-byte sequence in the window.
- *
- * @p
- *	A pointer to the next 3-byte sequence in the window.
- *
- * Returns the resulting hash code.
- */
+/* This hash function is taken from the LZMA SDK.  It seems to work well.
+
+ * TODO: Maybe use the SSE4.2 CRC32 instruction when available?  */
 static inline u32
 lz_bt_hash(const u8 *p)
 {
@@ -65,89 +98,75 @@
 	hash ^= p[1];
 	hash ^= (u32)p[2] << 8;
 
-	return hash & LZ_BT_HASH_MASK;
-}
-
-/*
- * Compute the number of bytes of memory that would be needed to initialize a
- * binary tree match-finder with the specified maximum window size.
- *
- * @max_window_size
- *	The maximum window size, in bytes, to query.
- *
- * Returns the number of bytes that would be allocated by lz_bt_init(),
- * excluding the size of the 'struct lz_bt' itself.
- */
-u64
-lz_bt_get_needed_memory(lz_bt_pos_t max_window_size)
-{
-	u64 len;
-
-	len = LZ_BT_HASH_SIZE + LZ_BT_DIGRAM_TAB_SIZE;
-	len += 2 * (u64)max_window_size;
-
-	return len * sizeof(lz_bt_pos_t);
-}
-
-/*
- * Initialize a binary tree match-finder.
- *
- * @mf
- *	The match-finder structure to initialize.
- * @max_window_size
- *	The maximum window size that shall be supported by subsequent calls to
- *	lz_bt_load_window().
- * @min_match_len
- *	The minimum length of matches that shall be produced by subsequent calls
- *	to lz_bt_get_matches().  This must be at least 2.
- * @max_match_len
- *	The maximum length of matches that shall be produced by subsequent calls
- *	to lz_bt_get_matches().  This must be at least @min_match_len.
- * @num_fast_bytes
- *	The maximum length of matches that shall be produced just using the
- *	binary tree search algorithm.  If the longest match has this length,
- *	then lz_bt_get_matches() will extend it up to @max_match_len.  This must
- *	be at least @min_match_len and no more than @max_match_len.
- * @max_search_depth
- *	The maximum depth to descend into the binary search tree before halting
- *	the search.
- *
- * Returns %true if successful; %false if out of memory.
- */
-bool
-lz_bt_init(struct lz_bt *mf,
-	   lz_bt_pos_t max_window_size,
-	   lz_bt_len_t min_match_len,
-	   lz_bt_len_t max_match_len,
-	   lz_bt_len_t num_fast_bytes,
-	   u32 max_search_depth)
-{
-	u64 len;
-
-	/* Check and set parameters.  */
-	LZ_ASSERT(min_match_len >= 2);
-	LZ_ASSERT(max_match_len >= min_match_len);
-	LZ_ASSERT(num_fast_bytes >= min_match_len);
-	LZ_ASSERT(num_fast_bytes <= max_match_len);
-
-	mf->max_window_size = max_window_size;
-	mf->min_match_len = min_match_len;
-	mf->max_match_len = max_match_len;
-	mf->num_fast_bytes = num_fast_bytes;
-	mf->max_search_depth = max_search_depth;
+	return hash % LZ_BT_HASH_LEN;
+}
+
+static void
+lz_bt_set_default_params(struct lz_mf_params *params)
+{
+	if (params->min_match_len == 0)
+		params->min_match_len = 2;
+
+	if (params->max_match_len == 0)
+		params->max_match_len = params->max_window_size;
+
+	if (params->max_search_depth == 0)
+		params->max_search_depth = 50;
+
+	if (params->nice_match_len == 0)
+		params->nice_match_len = 24;
+
+	if (params->nice_match_len < params->min_match_len)
+		params->nice_match_len = params->min_match_len;
+
+	if (params->nice_match_len > params->max_match_len)
+		params->nice_match_len = params->max_match_len;
+}
+
+static bool
+lz_bt_params_valid(const struct lz_mf_params *params)
+{
+	return true;
+}
+
+static u64
+lz_bt_get_needed_memory(u32 max_window_size)
+{
+	u64 len = 0;
+
+	len += LZ_BT_HASH_LEN;		 /* hash_tab */
+	len += LZ_BT_DIGRAM_TAB_LEN;	 /* digram_tab */
+	len += 2 * (u64)max_window_size; /* child_tab */
+
+	return len * sizeof(u32);
+}
+
+static bool
+lz_bt_init(struct lz_mf *_mf)
+{
+	struct lz_bt *mf = (struct lz_bt *)_mf;
+	struct lz_mf_params *params = &mf->base.params;
+	size_t len = 0;
+
+	lz_bt_set_default_params(params);
 
 	/* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'.  */
-	len = LZ_BT_HASH_SIZE + (2 * (u64)max_window_size);
-	if (mf->min_match_len <= 2)
-		len += LZ_BT_DIGRAM_TAB_SIZE;
-	len *= sizeof(lz_bt_pos_t);
-	if ((size_t)len != len || !(mf->hash_tab = MALLOC(len)))
+
+	len += LZ_BT_HASH_LEN;
+	if (params->min_match_len == 2)
+		len += LZ_BT_DIGRAM_TAB_LEN;
+	len += 2 * params->max_window_size;
+
+	mf->hash_tab = MALLOC(len * sizeof(u32));
+	if (!mf->hash_tab)
 		return false;
-	if (mf->min_match_len <= 2) {
-		mf->digram_tab = mf->hash_tab + LZ_BT_HASH_SIZE;
-		mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_SIZE;
+
+	if (params->min_match_len == 2) {
+		mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN;
+		mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN;
 	} else {
-		mf->child_tab = mf->hash_tab + LZ_BT_HASH_SIZE;
+		mf->digram_tab = NULL;
+		mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN;
 	}
 
 	/* Fill in the CRC32 table if not done already.  */
@@ -156,47 +175,21 @@
 	return true;
 }
 
-/*
- * Destroy a binary tree match-finder.
- *
- * @mf
- *	The match-finder structure to destroy.
- */
-void
-lz_bt_destroy(struct lz_bt *mf)
-{
-	FREE(mf->hash_tab);
-	/* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
-}
-
-/*
- * Load a window into a binary tree match-finder.
- *
- * @mf
- *	The match-finder structure into which to load the window.
- * @window
- *	Pointer to the window to load.  This memory must remain available,
- *	unmodified, while the match-finder is being used.
- * @window_size
- *	The size of the window, in bytes.  This can't be larger than the
- *	@max_window_size with which lz_bt_init() was called.
- */
-void
-lz_bt_load_window(struct lz_bt *mf, const u8 *window, lz_bt_pos_t window_size)
-{
-	LZ_ASSERT(window_size <= mf->max_window_size);
+static void
+lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size)
+{
+	struct lz_bt *mf = (struct lz_bt *)_mf;
 	size_t clear_len;
 
-	mf->cur_window = window;
-	mf->cur_window_pos = 0;
-	mf->cur_window_size = window_size;
-
-	/* Clear the hash and digram tables.
-	 * Note: The child table need not be cleared.  */
-	clear_len = LZ_BT_HASH_SIZE;
-	if (mf->min_match_len <= 2)
-		clear_len += LZ_BT_DIGRAM_TAB_SIZE;
-	memset(mf->hash_tab, 0, clear_len * sizeof(lz_bt_pos_t));
+	/* Clear hash_tab and digram_tab.
+	 * Note: child_tab need not be cleared.  */
+	clear_len = LZ_BT_HASH_LEN;
+	if (mf->digram_tab)
+		clear_len += LZ_BT_DIGRAM_TAB_LEN;
+	memset(mf->hash_tab, 0, clear_len * sizeof(u32));
+
+	if (size >= LZ_BT_HASH_BYTES)
+		mf->next_hash = lz_bt_hash(window);
 }
 
 /*
@@ -207,13 +200,6 @@
  *	The window being searched.
  * @cur_window_pos
  *	The current position in the window.
- * @min_len
- *	Ignore matches shorter than this length.  This must be at least 1.
- * @max_len
- *	Don't produce any matches longer than this length.  If we find a match
- *	this long, terminate the search and return.
- * @max_depth
- *	Stop if we reach this depth in the binary tree.
  * @child_tab
  *	Table of child pointers for the binary tree.  The children of the node
  *	for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 +
@@ -225,6 +211,13 @@
  *	The position in the window at which the binary tree for the current hash
  *	code is rooted.  This can be 0, which indicates that the binary tree for
  *	the current hash code is empty.
+ * @min_len
+ *	Ignore matches shorter than this length.  This must be at least 1.
+ * @max_len
+ *	Don't produce any matches longer than this length.  If we find a match
+ *	this long, terminate the search and return.
+ * @max_search_depth
+ *	Stop if we reach this depth in the binary tree.
  * @matches
  *	The array in which to produce the matches.  The matches will be produced
  *	in order of increasing length and increasing offset.  No more than one
@@ -234,14 +227,14 @@
  *
  * Returns the number of matches found and written to @matches.
  */
-static lz_bt_len_t
+static u32
 do_search(const u8 window[restrict],
-	  const lz_bt_pos_t cur_window_pos,
-	  const lz_bt_len_t min_len,
-	  const lz_bt_len_t max_len,
-	  const u32 max_depth,
-	  lz_bt_pos_t child_tab[restrict],
-	  lz_bt_pos_t cur_match,
+	  const u32 cur_window_pos,
+	  u32 child_tab[restrict],
+	  u32 cur_match,
+	  const u32 min_len,
+	  const u32 max_len,
+	  const u32 max_search_depth,
 	  struct lz_match matches[restrict])
 {
 	/*
@@ -327,9 +320,9 @@
 	 * In degenerate cases, the binary tree might become severely
 	 * unbalanced.  To prevent excessive running times, we stop immediately
 	 * (and return any matches that happen to have been found so far) if the
-	 * current depth exceeds @max_depth.  Note that this cutoff can occur
-	 * before the longest match has been found, which is usually bad for the
-	 * compression ratio.
+	 * current depth exceeds @max_search_depth.  Note that this cutoff can
+	 * occur before the longest match has been found, which is usually bad
+	 * for the compression ratio.
 	 *
 	 * ---------------------------------------------------------------------
 	 *
@@ -397,17 +390,17 @@
 	 * contain all valid positions.
 	 */
 
-	lz_bt_len_t num_matches = 0;
-	lz_bt_len_t longest_lt_match_len = 0;
-	lz_bt_len_t longest_gt_match_len = 0;
-	lz_bt_len_t longest_match_len = min_len - 1;
-	lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
-	lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
+	u32 num_matches = 0;
+	u32 longest_lt_match_len = 0;
+	u32 longest_gt_match_len = 0;
+	u32 longest_match_len = min_len - 1;
+	u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
+	u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
 	const u8 *strptr = &window[cur_window_pos];
-	u32 depth_remaining = max_depth;
+	u32 depth_remaining = max_search_depth;
 	for (;;) {
 		const u8 *matchptr;
-		lz_bt_len_t len;
+		u32 len;
 
 		if (depth_remaining-- == 0 || cur_match == 0) {
 			*pending_lt_ptr = 0;
@@ -429,7 +422,7 @@
 
 				matches[num_matches++] = (struct lz_match) {
 					.len = len,
-					.offset = cur_window_pos - cur_match,
+					.offset = strptr - matchptr,
 				};
 
 				if (len == max_len) {
@@ -454,146 +447,84 @@
 	}
 }
 
-/*
- * Retrieve a list of matches at the next position in the window.
- *
- * @mf
- *	The binary tree match-finder structure into which a window has been
- *	loaded using lz_bt_load_window().
- * @matches
- *	The array into which the matches will be returned.  The length of this
- *	array must be at least (@mf->num_fast_bytes - @mf->min_match_len + 1).
- *
- * The return value is the number of matches that were found and stored in the
- * 'matches' array.  The matches will be ordered by strictly increasing length
- * and strictly increasing offset.  No match shall have length less than
- * @min_match_len, and no match shall have length greater than @max_match_len.
- * The return value may be 0, which indicates that no matches were found.
- *
- * On completion, the binary tree match-finder is advanced to the next position
- * in the window.
- */
-lz_bt_len_t
-lz_bt_get_matches(struct lz_bt *mf, struct lz_match matches[])
-{
-	lz_bt_pos_t bytes_remaining;
-	lz_bt_len_t num_matches;
-	lz_bt_pos_t cur_match;
+static u32
+lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[])
+{
+	struct lz_bt *mf = (struct lz_bt *)_mf;
+	const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
 	u32 hash;
-
-	LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
-
-	bytes_remaining = lz_bt_get_remaining_size(mf);
-
-	/* If there are fewer than 3 bytes remaining, we can't even compute a
-	 * hash to look up a binary tree root.  If there are exactly 2 bytes
-	 * remaining we could still search for a length-2 match using the digram
-	 * table, but it's not worth bothering.  (Note: this is also useful for
-	 * LZX, since this excludes the length 2 match having the maximum
-	 * offset, which isn't allowed.)  */
-	if (bytes_remaining < 3) {
-		mf->cur_window_pos++;
-		return 0;
-	}
-
-	num_matches = 0;
-
-	/* Search the digram table for a length 2 match.  */
-	if (mf->min_match_len <= 2) {
-		u8 c1, c2;
-		u16 digram;
-
-		c1 = mf->cur_window[mf->cur_window_pos];
-		c2 = mf->cur_window[mf->cur_window_pos + 1];
-		digram = (u16)c1 | ((u16)c2 << 8);
+	u32 cur_match;
+	u32 min_len;
+	u32 num_matches = 0;
+
+	if (bytes_remaining <= LZ_BT_HASH_BYTES)
+		goto out;
+
+	if (mf->digram_tab) {
+		/* Search the digram table for a length 2 match.  */
+
+		const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base);
 		cur_match = mf->digram_tab[digram];
-		mf->digram_tab[digram] = mf->cur_window_pos;
+		mf->digram_tab[digram] = mf->base.cur_window_pos;
 
 		/* We're only interested in matches of length exactly 2, since
-		 * those won't be found during the binary tree search.  */
-		if (cur_match != 0 && mf->cur_window[cur_match + 2] !=
-				      mf->cur_window[mf->cur_window_pos + 2])
+		 * those won't be found during the binary tree search.
+		 *
+		 * Note: it's possible to extend this match as much as possible,
+		 * then use its length plus 1 as min_len for the binary tree
+		 * search.  However I found this actually *reduced* performance
+		 * slightly, evidently because the binary tree still needs to be
+		 * searched/updated starting from the root in either case.  */
+		if (cur_match != 0 &&
+		    (mf->base.cur_window[cur_match + 2] !=
+		     mf->base.cur_window[mf->base.cur_window_pos + 2]))
 		{
 			matches[num_matches++] = (struct lz_match) {
 				.len = 2,
-				.offset = mf->cur_window_pos - cur_match,
+				.offset = mf->base.cur_window_pos - cur_match,
 			};
 		}
+		min_len = 3;
+	} else {
+		min_len = mf->base.params.min_match_len;
 	}
 
-	/* Hash the length-3 byte sequence beginning at the current position in
-	 * the window.  */
-	hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]);
-
-	/* The corresponding hash bucket in 'hash_tab' contains the root of the
-	 * binary tree of previous window positions that have the same hash
-	 * code.  */
+	hash = mf->next_hash;
+	mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1);
+	prefetch(&mf->hash_tab[mf->next_hash]);
 	cur_match = mf->hash_tab[hash];
-
-	/* Update the hash bucket to point to the binary tree rooted at the
-	 * current position, which we will construct in do_search().  */
-	mf->hash_tab[hash] = mf->cur_window_pos;
-
-	/* Search the binary tree for matches.  At the same time, build the
-	 * binary tree rooted at the current position, which replaces the one we
-	 * search.  */
-	num_matches += do_search(mf->cur_window,
-				 mf->cur_window_pos,
-				 max(3, mf->min_match_len),
-				 min(bytes_remaining, mf->num_fast_bytes),
-				 mf->max_search_depth,
+	mf->hash_tab[hash] = mf->base.cur_window_pos;
+
+	/* Search the binary tree of 'hash' for matches while re-rooting it at
+	 * the current position.  */
+	num_matches += do_search(mf->base.cur_window,
+				 mf->base.cur_window_pos,
 				 mf->child_tab,
 				 cur_match,
+				 min_len,
+				 min(bytes_remaining, mf->base.params.nice_match_len),
+				 mf->base.params.max_search_depth,
 				 &matches[num_matches]);
 
-	/* If the longest match is @num_fast_bytes in length, it may have been
+	/* If the longest match is @nice_match_len in length, it may have been
 	 * truncated.  Try extending it up to the maximum match length.  */
-	if (num_matches != 0 && matches[num_matches - 1].len == mf->num_fast_bytes) {
-		lz_bt_pos_t limit;
-		const u8 *strptr, *matchptr;
-		lz_bt_len_t len;
-
-		limit = min(bytes_remaining, mf->max_match_len);
-		strptr = &mf->cur_window[mf->cur_window_pos];
-		matchptr = strptr - matches[num_matches - 1].offset;
+	if (num_matches != 0 &&
+	    matches[num_matches - 1].len == mf->base.params.nice_match_len)
+	{
+		const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
+		const u8 * const matchptr = strptr - matches[num_matches - 1].offset;
+		const u32 len_limit = min(bytes_remaining, mf->base.params.max_match_len);
+		u32 len;
+
 		len = matches[num_matches - 1].len;
-		while (len < limit && strptr[len] == matchptr[len])
+		while (len < len_limit && strptr[len] == matchptr[len])
 			len++;
 		matches[num_matches - 1].len = len;
 	}
 
-#ifdef ENABLE_LZ_DEBUG
-	/* Check the matches.  */
-	for (lz_bt_len_t i = 0; i < num_matches; i++) {
-		const u8 *matchptr, *strptr;
-
-		/* Length valid?  */
-		LZ_ASSERT(matches[i].len >= mf->min_match_len);
-		LZ_ASSERT(matches[i].len <= min(mf->max_match_len, bytes_remaining));
-
-		/* Offset valid?  */
-		LZ_ASSERT(matches[i].offset >= 1);
-		LZ_ASSERT(matches[i].offset <= lz_bt_get_position(mf));
-
-		/* Lengths and offsets strictly increasing?  */
-		if (i > 0) {
-			LZ_ASSERT(matches[i].len > matches[i - 1].len);
-			LZ_ASSERT(matches[i].offset > matches[i - 1].offset);
-		}
-
-		/* Actually a match?  */
-		strptr = lz_bt_get_window_ptr(mf);
-		matchptr = strptr - matches[i].offset;
-		LZ_ASSERT(!memcmp(strptr, matchptr, matches[i].len));
-
-		/* Match can't be extended further?  */
-		LZ_ASSERT(matches[i].len == min(mf->max_match_len, bytes_remaining) ||
-			  strptr[matches[i].len] != matchptr[matches[i].len]);
-	}
-#endif /* ENABLE_LZ_DEBUG  */
-
-	/* Advance to the next position in the window.  */
-	mf->cur_window_pos++;
+out:
+	/* Advance to the next position.  */
+	mf->base.cur_window_pos++;
 
 	/* Return the number of matches found.  */
 	return num_matches;
@@ -603,20 +534,21 @@
  * See do_search() for explanatory comments.  */
 static void
 do_skip(const u8 window[restrict],
-	const lz_bt_pos_t cur_window_pos,
-	const lz_bt_len_t max_len,
-	u32 depth_remaining,
-	lz_bt_pos_t child_tab[restrict],
-	lz_bt_pos_t cur_match)
-{
-	lz_bt_len_t longest_lt_match_len = 0;
-	lz_bt_len_t longest_gt_match_len = 0;
-	lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
-	lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
+	const u32 cur_window_pos,
+	u32 child_tab[restrict],
+	u32 cur_match,
+	const u32 max_len,
+	const u32 max_search_depth)
+{
+	u32 longest_lt_match_len = 0;
+	u32 longest_gt_match_len = 0;
+	u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
+	u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
 	const u8 * const strptr = &window[cur_window_pos];
+	u32 depth_remaining = max_search_depth;
 	for (;;) {
 		const u8 *matchptr;
-		lz_bt_len_t len;
+		u32 len;
 
 		if (depth_remaining-- == 0 || cur_match == 0) {
 			*pending_lt_ptr = 0;
@@ -650,57 +582,68 @@
 	}
 }
 
-/* Skip the current position in the binary tree match-finder.  */
 static void
 lz_bt_skip_position(struct lz_bt *mf)
 {
-	lz_bt_pos_t bytes_remaining;
+	const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
 	u32 hash;
-	lz_bt_pos_t cur_match;
-
-	LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
-
-	bytes_remaining = lz_bt_get_remaining_size(mf);
-
-	/* As explained in lz_bt_get_matches(), we don't search for matches if
-	 * there are fewer than 3 bytes remaining in the window.  */
-	if (bytes_remaining < 3) {
-		mf->cur_window_pos++;
-		return;
+	u32 cur_match;
+
+	if (bytes_remaining <= LZ_BT_HASH_BYTES)
+		goto out;
+
+	/* Update the digram table.  */
+	if (mf->digram_tab) {
+		const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base);
+		mf->digram_tab[digram] = mf->base.cur_window_pos;
 	}
 
-	/* Update the digram table.  */
-	if (mf->min_match_len <= 2) {
-		u8 c1, c2;
-		u16 digram;
-
-		c1 = mf->cur_window[mf->cur_window_pos];
-		c2 = mf->cur_window[mf->cur_window_pos + 1];
-		digram = (u16)c1 | ((u16)c2 << 8);
-		mf->digram_tab[digram] = mf->cur_window_pos;
-	}
-
 	/* Update the hash table.  */
-	hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]);
+	hash = mf->next_hash;
+	mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1);
+	prefetch(&mf->hash_tab[mf->next_hash]);
 	cur_match = mf->hash_tab[hash];
-	mf->hash_tab[hash] = mf->cur_window_pos;
+	mf->hash_tab[hash] = mf->base.cur_window_pos;
 
 	/* Update the binary tree for the appropriate hash code.  */
-	do_skip(mf->cur_window,
-		mf->cur_window_pos,
-		min(bytes_remaining, mf->num_fast_bytes),
-		mf->max_search_depth,
+	do_skip(mf->base.cur_window,
+		mf->base.cur_window_pos,
 		mf->child_tab,
-		cur_match);
-
+		cur_match,
+		min(bytes_remaining, mf->base.params.nice_match_len),
+		mf->base.params.max_search_depth);
+
+out:
 	/* Advance to the next position.  */
-	mf->cur_window_pos++;
-}
-
-/* Skip 'n' positions in the binary tree match-finder.  */
-void
-lz_bt_skip_positions(struct lz_bt *mf, unsigned n)
-{
-	while (n--)
+	mf->base.cur_window_pos++;
+}
+
+static void
+lz_bt_skip_positions(struct lz_mf *_mf, u32 n)
+{
+	struct lz_bt *mf = (struct lz_bt *)_mf;
+
+	do {
 		lz_bt_skip_position(mf);
-}
+	} while (--n);
+}
+
+static void
+lz_bt_destroy(struct lz_mf *_mf)
+{
+	struct lz_bt *mf = (struct lz_bt *)_mf;
+
+	FREE(mf->hash_tab);
+	/* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
+}
+
+const struct lz_mf_ops lz_binary_trees_ops = {
+	.params_valid      = lz_bt_params_valid,
+	.get_needed_memory = lz_bt_get_needed_memory,
+	.init		   = lz_bt_init,
+	.load_window       = lz_bt_load_window,
+	.get_matches       = lz_bt_get_matches,
+	.skip_positions    = lz_bt_skip_positions,
+	.destroy           = lz_bt_destroy,
+	.struct_size	   = sizeof(struct lz_bt),
+};
src/lz_hash.c to src/lz_mf.c
--- a/src/lz_hash.c
+++ b/src/lz_mf.c
@@ -1,313 +1,348 @@
 /*
- * lz_hash.c
- *
- * This file provides the code to analyze a buffer of uncompressed data for
- * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
- * process.  This is based on code from zlib (v. 1.2.5).
- */
-
-/*
- * Copyright (C) 2012, 2013 Eric Biggers
- * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
- *
- * This file is part of wimlib, a library for working with WIM files.
- *
- * wimlib is free software; you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 3 of the License, or (at your option)
- * any later version.
- *
- * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- * A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ * lz_mf.c
+ *
+ * Interface for Lempel-Ziv match-finders.
+ *
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifdef HAVE_CONFIG_H
-#  include <config.h>
+#  include "config.h"
 #endif
 
-#include "wimlib/lz_hash.h"
+#include "wimlib/lz_mf.h"
+#include "wimlib/lz_mf_ops.h"
 #include "wimlib/util.h"
 
-#include <string.h>
-
-#define HASH_BITS	15
-#define HASH_SIZE	(1 << HASH_BITS)
-#define HASH_MASK	(HASH_SIZE - 1)
-#define HASH_SHIFT	5
-
-/* Hash function, based on code from zlib.  This function will update and return
- * the hash value @hash for the string ending on the additional input character
- * @c.  This function must be called for each consecutive character, because it
- * uses a running hash value rather than computing it separately for each
- * 3-character string.
- *
- * The AND operation guarantees that only 3 characters will affect the hash
- * value, so every identical 3-character string will have the same hash value.
- */
-static inline unsigned
-update_hash(unsigned hash, u8 c)
-{
-	return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
-}
-
-
-/* Insert a 3-character string at position @str_pos in @window and with hash
- * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
- * on code from zlib.
- *
- * The hash table uses chains (linked lists) for the hash buckets, but there are
- * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
- * within the window of the last string in the hash bucket.  To find the index
- * of the previous string in the hash chain, the `prev_tab' array is indexed by
- * the string index.  `prev_tab' can be indexed repeatedly by the string index
- * to walk through the hash chain, until the special index `0' is reached,
- * indicating the end of the hash chain.
- */
-static inline unsigned
-insert_string(u32 hash_tab[], u32 prev_tab[],
-	      const u8 window[], unsigned str_pos,
-	      unsigned hash)
-{
-	hash = update_hash(hash, window[str_pos + 2]);
-	prev_tab[str_pos] = hash_tab[hash];
-	hash_tab[hash] = str_pos;
-	return hash;
-}
-
-
-/*
- * Returns the longest match for a given input position.
- *
- * @window:		The window of uncompressed data.
- * @bytes_remaining:	The number of bytes remaining in the window.
- * @strstart:		The index of the start of the string in the window that
- *				we are trying to find a match for.
- * @prev_tab:		The array of prev pointers for the hash table.
- * @cur_match:		The index of the head of the hash chain for matches
- *				having the hash value of the string beginning
- *				at index @strstart.
- * @prev_len:		The length of the match that was found for the string
- *				beginning at (@strstart - 1).
- * @match_start_ret:	A location into which the index of the start of the
- *				match will be returned.
- * @params:		Parameters that affect how long the search will proceed
- *				before going with the best that has been found
- *				so far.
- * @min_start_pos:	If the chain reaches a match starting before this
- *			position (including the end-of-chain 0), the search will
- *			be terminated.
- *
- * Returns the length of the match that was found.
- */
-static unsigned
-longest_match(const u8 window[], unsigned bytes_remaining,
-	      unsigned strstart, const u32 prev_tab[],
-	      unsigned cur_match, unsigned prev_len,
-	      unsigned *match_start_ret,
-	      const struct lz_params *params,
-	      unsigned min_start_pos)
-{
-	unsigned chain_len = params->max_chain_len;
-
-	const u8 *scan = window + strstart;
-	const u8 *match;
-	unsigned len;
-	unsigned best_len = prev_len;
-	unsigned match_start = cur_match;
-
-	unsigned nice_match = min(params->nice_match, bytes_remaining);
-
-	const u8 *strend = scan + min(params->max_match, bytes_remaining);
-
-	u8 scan_end1 = scan[best_len - 1];
-	u8 scan_end = scan[best_len];
-
-
-	/* Do not waste too much time if we already have a good match: */
-	if (best_len >= params->good_match)
-		chain_len >>= 2;
-
-	do {
-		match = &window[cur_match];
-
-		/* Skip to next match if the match length cannot increase or if
-		 * the match length is less than 2.  Note that the checks below
-		 * for insufficient lookahead only occur occasionally for
-		 * performance reasons.  Therefore uninitialized memory will be
-		 * accessed, and conditional jumps will be made that depend on
-		 * those values.  However the length of the match is limited to
-		 * the lookahead, so the output of lz_analyze_block() is not
-		 * affected by the uninitialized values.  */
-
-		if (match[best_len] != scan_end
-		    || match[best_len - 1] != scan_end1
-		    || *match != *scan
-		    || *++match != scan[1])
-			continue;
-		scan++;
-
-	#if 0
-		do {
-		} while (scan < strend && *++match == *++scan);
-	#else
-
-		do {
-		} while (
-			 *++match == *++scan && *++match == *++scan &&
-			 *++match == *++scan && *++match == *++scan &&
-			 *++match == *++scan && *++match == *++scan &&
-			 *++match == *++scan && *++match == *++scan &&
-			 scan < strend);
+/* Available match-finding algorithms.  */
+static const struct lz_mf_ops *mf_ops[] = {
+	[LZ_MF_NULL]			= &lz_null_ops,
+	[LZ_MF_BRUTE_FORCE]		= &lz_brute_force_ops,
+	[LZ_MF_HASH_CHAINS]		= &lz_hash_chains_ops,
+	[LZ_MF_BINARY_TREES]		= &lz_binary_trees_ops,
+	[LZ_MF_LCP_INTERVAL_TREE]	= &lz_lcp_interval_tree_ops,
+	[LZ_MF_LINKED_SUFFIX_ARRAY]	= &lz_linked_suffix_array_ops,
+};
+
+/*
+ * Automatically select a match-finding algorithm to use, in the case that the
+ * user did not specify one.
+ */
+static const struct lz_mf_ops *
+select_mf_ops(enum lz_mf_algo algorithm, u32 max_window_size)
+{
+	if (algorithm == LZ_MF_DEFAULT) {
+		if (max_window_size <= 32768)
+			algorithm = LZ_MF_HASH_CHAINS;
+		else if (max_window_size <= 2097152)
+			algorithm = LZ_MF_BINARY_TREES;
+		else if (max_window_size <= 33554432)
+			algorithm = LZ_MF_LCP_INTERVAL_TREE;
+		else
+			algorithm = LZ_MF_LINKED_SUFFIX_ARRAY;
+	}
+	if ((int)algorithm < 0 || (int)algorithm >= ARRAY_LEN(mf_ops))
+		return NULL;
+	return mf_ops[(int)algorithm];
+}
+
+/*
+ * Returns an upper bound on the number of bytes of memory that will be consumed
+ * by a match-finder allocated with the specified algorithm and maximum window
+ * size.
+ *
+ * The returned value does not include the size of the window itself.  The
+ * caller must account for this separately if needed.
+ *
+ * If @algorithm is invalid, returns 0.
+ */
+u64
+lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size)
+{
+	const struct lz_mf_ops *ops;
+
+	ops = select_mf_ops(algorithm, max_window_size);
+	if (!ops)
+		return 0;
+	return ops->struct_size + ops->get_needed_memory(max_window_size);
+}
+/*
+ * Returns %true if and only if the specified parameters can be validly used to
+ * create a match-finder using lz_mf_alloc().
+ */
+bool
+lz_mf_params_valid(const struct lz_mf_params *params)
+{
+	const struct lz_mf_ops *ops;
+
+	/* Require that a valid algorithm, or LZ_MF_DEFAULT, be specified.  */
+	ops = select_mf_ops(params->algorithm, params->max_window_size);
+	if (!ops)
+		return false;
+
+	/* Don't allow empty windows.  Otherwise, some match-finding algorithms
+	 * might need special-case code to handle empty windows.  */
+	if (params->max_window_size == 0)
+		return false;
+
+	/* Don't allow length-1 matches, so that match-finding algorithms don't
+	 * need to worry about this case.  Most LZ-based compression formats
+	 * don't allow length-1 matches, since they usually aren't helpful for
+	 * compression.  Also, if a compressor really does need length-1
+	 * matches, it can easily maintain its own table of length 256
+	 * containing the most-recently-seen position for each byte value.
+	 *
+	 * min_match_len == 0 is valid, since that means the match-finding
+	 * algorithm will fill in a default value.  */
+	if (params->min_match_len == 1)
+		return false;
+
+	if (params->max_match_len != 0) {
+
+		/* Don't allow length-1 matches (same reason as above).  */
+		if (params->max_match_len == 1)
+			return false;
+
+		/* Don't allow the maximum match length to be shorter than the
+		 * minimum match length.  */
+		if (params->max_match_len < params->min_match_len)
+			return false;
+	}
+
+	/* Don't allow the needed memory size to overflow a 'size_t'.  */
+	if (sizeof(size_t) < sizeof(u64)) {
+		u64 needed_mem = ops->get_needed_memory(params->max_window_size);
+		if ((size_t)needed_mem != needed_mem)
+			return false;
+	}
+
+	/* Call the algorithm-specific routine to finish the validation.  */
+	return ops->params_valid(params);
+}
+
+/*
+ * Allocate a new match-finder.
+ *
+ * @params
+ *	The parameters for the match-finder.  See the declaration of 'struct
+ *	lz_mf_params' for more information.
+ *
+ * Returns a pointer to the new match-finder, or NULL if out of memory or the
+ * parameters are invalid.  Call lz_mf_params_valid() beforehand to test the
+ * parameter validity separately.
+ */
+struct lz_mf *
+lz_mf_alloc(const struct lz_mf_params *params)
+{
+	struct lz_mf *mf;
+	const struct lz_mf_ops *ops;
+
+	/* Validate the parameters.  */
+	if (!lz_mf_params_valid(params))
+		return NULL;
+
+	/* Get the match-finder operations structure.  Since we just validated
+	 * the parameters, this is guaranteed to return a valid structure.  */
+	ops = select_mf_ops(params->algorithm, params->max_window_size);
+	LZ_ASSERT(ops != NULL);
+
+	/* Allocate memory for the match-finder structure.  */
+	LZ_ASSERT(ops->struct_size >= sizeof(struct lz_mf));
+	mf = CALLOC(1, ops->struct_size);
+	if (!mf)
+		return NULL;
+
+	/* Set the parameters and operations fields.  */
+	mf->params = *params;
+	mf->ops = *ops;
+
+	/* Perform algorithm-specific initialization.  Normally this is where
+	 * most of the necessary memory is allocated.  */
+	if (!mf->ops.init(mf)) {
+		FREE(mf);
+		return NULL;
+	}
+
+	/* The algorithm must have set min_match_len and max_match_len if either
+	 * was 0.  */
+	LZ_ASSERT(mf->params.min_match_len >= 2);
+	LZ_ASSERT(mf->params.max_match_len >= mf->params.min_match_len);
+
+	return mf;
+}
+
+/*
+ * Load a window into the match-finder.
+ *
+ * @mf
+ *	The match-finder into which to load the window.
+ * @window
+ *	Pointer to the window to load.  This memory must remain available,
+ *	unmodified, while the match-finder is being used.
+ * @size
+ *	The size of the window, in bytes.  This can't be larger than the
+ *	@max_window_size parameter.  In addition, this can't be 0.
+ *
+ * Note: this interface does not support sliding windows!
+ */
+void
+lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size)
+{
+	/* Can't be an empty window, and can't be larger than the maximum window
+	 * size with which the match-finder was allocated.  */
+	LZ_ASSERT(size > 0);
+	LZ_ASSERT(size <= mf->params.max_window_size);
+
+	/* Save the window and initialize the current position.  */
+	mf->cur_window = window;
+	mf->cur_window_size = size;
+	mf->cur_window_pos = 0;
+
+	/* Call into the algorithm-specific window load code.  */
+	mf->ops.load_window(mf, window, size);
+}
+
+/*
+ * Retrieve a list of matches at the next position in the window.
+ *
+ * @mf
+ *	The match-finder into which a window has been loaded using
+ *	lz_mf_load_window().
+ * @matches
+ *	The array into which the matches will be returned.  The returned match
+ *	count will not exceed the minimum of @max_search_depth and (@len_limit -
+ *	@min_match_len + 1), where @len_limit is itself defined as
+ *	min(@max_match_len, @nice_match_len).
+ *
+ * The return value is the number of matches that were found and stored in the
+ * 'matches' array.  The matches will be ordered by strictly increasing length
+ * and strictly increasing offset.  No match shall have length less than
+ * @min_match_len, and no match shall have length greater than @max_match_len.
+ * The return value may be 0, which indicates that no matches were found.
+ *
+ * On completion, the match-finder is advanced to the next position in the
+ * window.
+ *
+ * Note: in-non-debug mode, the inline definition of this gets used instead.
+ * They are the same, except that the non-inline version below validates the
+ * results to help debug match-finding algorithms.
+ */
+#ifdef ENABLE_LZ_DEBUG
+u32
+lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
+{
+	LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
+
+	const u32 orig_pos = mf->cur_window_pos;
+	const u32 len_limit = min(mf->params.max_match_len,
+				  lz_mf_get_bytes_remaining(mf));
+	const u8 * const strptr = lz_mf_get_window_ptr(mf);
+
+	const u32 num_matches = mf->ops.get_matches(mf, matches);
+
+	LZ_ASSERT(mf->cur_window_pos == orig_pos + 1);
+
+#if 0
+	fprintf(stderr, "Pos %"PRIu32"/%"PRIu32": %"PRIu32" matches\n",
+		orig_pos, mf->cur_window_size, num_matches);
+	for (u32 i = 0; i < num_matches; i++) {
+		fprintf(stderr, "\tLen %"PRIu32" Offset %"PRIu32"\n",
+			matches[i].len, matches[i].offset);
+	}
+#endif
+
+	/* Validate the matches.  */
+	for (u32 i = 0; i < num_matches; i++) {
+		const u32 len = matches[i].len;
+		const u32 offset = matches[i].offset;
+		const u8 *matchptr;
+
+		/* Length valid?  */
+		LZ_ASSERT(len >= mf->params.min_match_len);
+		LZ_ASSERT(len <= len_limit);
+
+		/* Offset valid?  */
+		LZ_ASSERT(offset >= 1);
+		LZ_ASSERT(offset <= orig_pos);
+
+		/* Lengths and offsets strictly increasing?  */
+		if (i > 0) {
+			LZ_ASSERT(len > matches[i - 1].len);
+			LZ_ASSERT(offset > matches[i - 1].offset);
+		}
+
+		/* Actually a match?  */
+		matchptr = strptr - offset;
+		LZ_ASSERT(!memcmp(strptr, matchptr, len));
+
+		/* Match can't be extended further?  */
+		LZ_ASSERT(len == len_limit || strptr[len] != matchptr[len]);
+	}
+
+	return num_matches;
+}
+#endif /* ENABLE_LZ_DEBUG */
+
+/*
+ * Skip 'n' positions in the match-finder.  This is a faster alternative to
+ * calling lz_mf_get_matches() at each position to advance the match-finder.
+ *
+ * 'n' must be greater than 0.
+ *
+ * Note: in-non-debug mode, the inline definition of this gets used instead.
+ * They are the same, except the non-inline version below does extra checks.
+ */
+#ifdef ENABLE_LZ_DEBUG
+void
+lz_mf_skip_positions(struct lz_mf *mf, const u32 n)
+{
+	LZ_ASSERT(n > 0);
+	LZ_ASSERT(n <= lz_mf_get_bytes_remaining(mf));
+
+	const u32 orig_pos = mf->cur_window_pos;
+
+	mf->ops.skip_positions(mf, n);
+
+	LZ_ASSERT(mf->cur_window_pos == orig_pos + n);
+}
+#endif
+
+/*
+ * Free the match-finder.
+ *
+ * This frees all memory that was allocated by the call to lz_mf_alloc().
+ */
+void
+lz_mf_free(struct lz_mf *mf)
+{
+	if (mf) {
+		mf->ops.destroy(mf);
+	#ifdef ENABLE_LZ_DEBUG
+		memset(mf, 0, mf->ops.struct_size);
 	#endif
-		len = match - &window[cur_match];
-
-		scan = &window[strstart];
-
-		if (len > best_len) {
-			match_start = cur_match;
-			best_len = len;
-			if (len >= nice_match)
-				break;
-			scan_end1  = scan[best_len - 1];
-			scan_end   = scan[best_len];
-		}
-	} while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
-	*match_start_ret = match_start;
-	return min(min(best_len, bytes_remaining), params->max_match);
-}
-
-
-
-/*
- * Determines the sequence of matches and literals that a block of data will be
- * compressed to.
- *
- * @window:		The data that is to be compressed.
- * @window_size:	The length of @window, in bytes.
- * @record_match:	Consumer for matches.
- * @record_literal:	Consumer for literals.
- * @record_ctx:		Context passed to @record_match and @record_literal.
- * @params:		Structure that contains parameters that affect how the
- *				analysis proceeds (mainly how good the matches
- *				have to be).
- * @prev_tab:		Temporary space containing least @window_size elements.
- */
-void
-lz_analyze_block(const u8 window[restrict],
-		 u32 window_size,
-		 lz_record_match_t record_match,
-		 lz_record_literal_t record_literal,
-		 void *record_ctx,
-		 const struct lz_params *params,
-		 u32 prev_tab[restrict])
-{
-	unsigned cur_input_pos = 0;
-	unsigned hash          = 0;
-	unsigned hash_head     = 0;
-	unsigned prev_len      = params->min_match - 1;
-	unsigned prev_start;
-	unsigned match_len     = params->min_match - 1;
-	unsigned match_start   = 0;
-	bool match_available = false;
-	u32 hash_tab[HASH_SIZE];
-	unsigned min_start_pos = 1;
-
-	ZERO_ARRAY(hash_tab);
-
-	do {
-		/* If there are at least 3 characters remaining in the input,
-		 * insert the 3-character string beginning at
-		 * window[cur_input_pos] into the hash table.
-		 *
-		 * hash_head is set to the index of the previous string in the
-		 * hash bucket, or 0 if there is no such string */
-		if (window_size - cur_input_pos >= params->min_match) {
-			hash = insert_string(hash_tab, prev_tab,
-					     window,
-					     cur_input_pos, hash);
-			hash_head = prev_tab[cur_input_pos];
-		} else {
-			hash_head = 0;
-		}
-
-
-		/* Find the longest match, discarding those <= prev_len. */
-		prev_len = match_len;
-		prev_start = match_start;
-		match_len = params->min_match - 1;
-
-		if (cur_input_pos > params->max_offset)
-			min_start_pos = cur_input_pos - params->max_offset;
-		else
-			min_start_pos = 1;
-
-		if (hash_head >= min_start_pos &&
-		    prev_len < params->max_lazy_match)
-		{
-			/* To simplify the code, we prevent matches with the
-			 * string of window index 0 (in particular we have to
-			 * avoid a match of the string with itself at the start
-			 * of the input file).  */
-			match_len = longest_match(window,
-						  window_size - cur_input_pos,
-						  cur_input_pos, prev_tab,
-						  hash_head, prev_len,
-						  &match_start, params,
-						  min_start_pos);
-
-			if (match_len == params->min_match &&
-			     cur_input_pos - match_start > params->too_far)
-				match_len = params->min_match - 1;
-		}
-
-		/* If there was a match at the previous step and the current
-		 * match is not better, output the previous match:
-		 */
-		if (prev_len >= params->min_match && match_len <= prev_len) {
-
-
-			(*record_match)(prev_len,
-					cur_input_pos - 1 - prev_start,
-					record_ctx);
-
-			/* Insert in hash table all strings up to the end of the
-			 * match.  strstart-1 and strstart are already inserted.
-			 * If there is not enough lookahead, the last two
-			 * strings are not inserted in the hash table.  */
-
-			/* Do not insert strings in hash table beyond this. */
-			unsigned max_insert = window_size - params->min_match;
-
-			prev_len -= 2;
-
-			do {
-				if (++cur_input_pos <= max_insert) {
-					hash = insert_string(hash_tab, prev_tab,
-							     window,
-							     cur_input_pos,
-							     hash);
-				}
-			} while (--prev_len != 0);
-			match_available = false;
-			match_len = params->min_match - 1;
-		} else if (match_available) {
-			/* If there was no match at the previous position,
-			 * output a single literal. If there was a match but the
-			 * current match is longer, truncate the previous match
-			 * to a single literal.  */
-			(*record_literal)(window[cur_input_pos - 1], record_ctx);
-		} else {
-			/* There is no previous match to compare with, wait for
-			 * the next step to decide.  */
-			match_available = true;
-		}
-	} while (++cur_input_pos < window_size);
-
-	if (match_available)
-		(*record_literal)(window[cur_input_pos - 1], record_ctx);
-}
+		FREE(mf);
+	}
+}
1 2 3 > >> (Page 1 of 3)