#2571 .tar.gz random access

Status: open
Owner: nobody
Labels: None
Priority: 5
Updated: 2025-06-14
Created: 2025-06-14
Creator: unxed
Private: No

Currently, extracting a single file from a .tar.gz archive requires reading the entire portion of the archive preceding that file twice:

  1. When you "enter" the archive as a folder and the index is built.
  2. When you extract the file: to decompress from the file's offset, the archiver needs the gzip dictionary, which can only be reconstructed by decompressing all the data that precedes the file in the stream.

This can be optimized. If the decompression dictionary is cached when the archive is entered as a folder, and an index mapping each packed file to its offset in the gzip stream is saved, then extraction can start directly from the required offset, and decompression no longer has to process the entire preceding part of the archive. This could significantly improve random-access performance in .tar.gz archives.
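To illustrate the idea: a gzip stream normally has to be decompressed from the very start to reach any offset, and a seek-point index works around that. Below is a minimal, stdlib-only sketch of the technique (not far2l's or indexed_gzip's actual implementation; the function names and spacing constant are illustrative). It snapshots the zlib decompressor state at regular intervals, so a later read can resume from the nearest snapshot instead of from byte 0:

```python
import gzip
import zlib

CHUNK = 512          # compressed bytes fed to the decompressor per step
SPACING = 64 * 1024  # target distance between seek points, in uncompressed bytes

def build_seek_index(comp):
    """Single pass over the gzip stream, snapshotting decompressor state
    roughly every SPACING uncompressed bytes.  Returns a list of
    (uncompressed_offset, compressed_offset, decompressor_snapshot) tuples."""
    d = zlib.decompressobj(16 + zlib.MAX_WBITS)  # 16+ selects the gzip wrapper
    index = []
    uncomp = comp_pos = last = 0
    while comp_pos < len(comp):
        chunk = comp[comp_pos:comp_pos + CHUNK]
        comp_pos += len(chunk)
        uncomp += len(d.decompress(chunk))
        if uncomp - last >= SPACING:
            # copy() captures the full inflate state, including the 32 KiB window
            index.append((uncomp, comp_pos, d.copy()))
            last = uncomp
    return index

def read_at(comp, index, offset, size):
    """Read `size` uncompressed bytes starting at `offset`, resuming from the
    nearest preceding seek point instead of decompressing from byte 0."""
    u0, c0, snap = 0, 0, zlib.decompressobj(16 + zlib.MAX_WBITS)
    for u, c, d in index:
        if u <= offset:
            u0, c0, snap = u, c, d
    d = snap.copy()  # work on a copy so the stored snapshot stays reusable
    out = b""
    pos = c0
    skip = offset - u0  # bytes between the seek point and the target offset
    while len(out) < skip + size and pos < len(comp):
        out += d.decompress(comp[pos:pos + CHUNK])
        pos += CHUNK
    return out[skip:skip + size]
```

A production index (as in zlib's zran.c example or the indexed_gzip library) stores only the 32 KiB window and bit offset at each seek point rather than live decompressor objects, so the index can be serialized to disk, but the access pattern is the same.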

Discussion

  • unxed

    unxed - 2025-06-14

    Here is a PoC in Python using the awesome indexed_gzip library:

    import indexed_gzip as igzip
    import tarfile
    import json
    import os
    import argparse
    import time
    
    # File extensions for our index files
    GZIP_INDEX_EXT = '.gzi'
    TAR_MANIFEST_EXT = '.json'
    
    def create_index(tar_gz_path, force=False):
        """
        Creates a GZIP index (.gzi) and a TAR manifest (.json).
        This is a "slow" one-time operation per archive.
        This can be done simultaneously with listing of archive contents.
        """
        gzip_index_path = tar_gz_path + GZIP_INDEX_EXT
        tar_manifest_path = tar_gz_path + TAR_MANIFEST_EXT
    
        if os.path.exists(gzip_index_path) and os.path.exists(tar_manifest_path) and not force:
            print("Indexes already exist. Use --force to recreate them.")
            return
    
        print(f"Creating indexes for '{tar_gz_path}'...")
        start_time = time.time()
    
        tar_manifest = {}
    
        # Open the file using indexed_gzip to prepare for indexing
        # The constructor works with a positional filename, unlike the igzip.open() factory
        with igzip.IndexedGzipFile(tar_gz_path) as f:
            # 1. Create the GZIP seek-point index.
            # This is the most time-consuming part of the process.
            f.build_full_index()
            f.export_index(gzip_index_path)
            print(f"  -> GZIP index saved to '{gzip_index_path}'")
    
            # 2. Create the TAR manifest by reading the uncompressed stream.
            # Rewind the stream to the beginning for tarfile to read it.
            f.seek(0)
    
            # tarfile can work with file-like objects, including our indexed_gzip handle.
            with tarfile.open(fileobj=f, mode='r:') as tar:
                for member in tar.getmembers():
                    if member.isfile():
                        # Store the name, data offset, and size of the file.
                        tar_manifest[member.name] = {
                            "offset": member.offset_data,
                            "size": member.size
                        }
    
        # 3. Save the manifest to a JSON file for quick lookups later.
        with open(tar_manifest_path, 'w') as f_json:
            json.dump(tar_manifest, f_json, indent=2)
        print(f"  -> TAR manifest saved to '{tar_manifest_path}'")
    
        end_time = time.time()
        print(f"Indexing completed in {end_time - start_time:.2f} seconds.")
    
    
    def extract_file_fast(tar_gz_path, filename_to_extract, output_path):
        """
        Quickly extracts a single file using the pre-built indexes.
        """
        tar_manifest_path = tar_gz_path + TAR_MANIFEST_EXT
        gzip_index_path = tar_gz_path + GZIP_INDEX_EXT
    
        if not (os.path.exists(tar_manifest_path) and os.path.exists(gzip_index_path)):
            print(f"Error: index files for '{tar_gz_path}' not found.")
            print(f"Please run the 'index' command first for '{tar_gz_path}'.")
            return
    
        print(f"Performing fast extraction of '{filename_to_extract}'...")
        start_time = time.time()
    
        # 1. Load the TAR manifest to find the file's location.
        with open(tar_manifest_path, 'r') as f_json:
            tar_manifest = json.load(f_json)
    
        if filename_to_extract not in tar_manifest:
            print(f"Error: File '{filename_to_extract}' not found in the archive's manifest.")
            return
    
        file_info = tar_manifest[filename_to_extract]
        offset = file_info['offset']
        size = file_info['size']
    
        print(f"  -> File found in manifest. Offset: {offset}, Size: {size} bytes.")
    
        # 2. Open the archive with indexed_gzip and load the saved .gzi index,
        # so seeking uses the pre-built seek points instead of rebuilding them.
        with igzip.IndexedGzipFile(tar_gz_path) as f:
            f.import_index(tar_gz_path + GZIP_INDEX_EXT)
            # 3. Seek to the file's data offset in the uncompressed stream.
            # THIS IS THE MAGIC! The library finds the nearest seek point
            # in the compressed stream and only decompresses a small chunk to get there.
            print(f"  -> Seeking to {offset} in the uncompressed stream...")
            f.seek(offset)
    
            # 4. Read exactly the number of bytes required for the file.
            print(f"  -> Reading {size} bytes...")
            file_data = f.read(size)
    
        # 5. Save the extracted data to the output file.
        with open(output_path, 'wb') as f_out:
            f_out.write(file_data)
    
        end_time = time.time()
        print(f"  -> File successfully extracted to '{output_path}'.")
        print(f"Extraction completed in {end_time - start_time:.4f} seconds.")
    
    
    if __name__ == "__main__":
        # Set up the command-line argument parser
        parser = argparse.ArgumentParser(
            description="A utility for fast interaction with .tar.gz archives using an index."
        )
        subparsers = parser.add_subparsers(dest="command", required=True)
    
        # 'index' command
        parser_index = subparsers.add_parser("index", help="Create indexes for an archive.")
        parser_index.add_argument("archive", type=str, help="Path to the .tar.gz archive.")
        parser_index.add_argument("--force", action="store_true", help="Recreate indexes if they already exist.")
    
        # 'extract' command
        parser_extract = subparsers.add_parser("extract", help="Quickly extract a single file from the archive.")
        parser_extract.add_argument("archive", type=str, help="Path to the .tar.gz archive.")
        parser_extract.add_argument("file_in_archive", type=str, help="Name of the file inside the archive to extract.")
        parser_extract.add_argument("output_path", type=str, help="Where to save the extracted file.")
    
        args = parser.parse_args()
    
        # Execute the chosen command
        if args.command == "index":
            create_index(args.archive, args.force)
        elif args.command == "extract":
            extract_file_fast(args.archive, args.file_in_archive, args.output_path)
    

    In this simple example, building the gzip index and building the TAR manifest are two separate passes over the source archive. In a full implementation, both can be done in one pass.
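    For the TAR side, one pass really is enough: tarfile exposes `member.offset_data`, the absolute position of a member's bytes in the uncompressed stream. Here is a stdlib-only sketch of that half of the PoC (function names are illustrative, and `gzip.decompress` stands in for a seekable indexed stream):

```python
import gzip
import io
import tarfile

def build_manifest(tar_gz_bytes):
    """One pass over the archive: map each regular file to
    (offset in the uncompressed TAR stream, size)."""
    manifest = {}
    with gzip.open(io.BytesIO(tar_gz_bytes)) as gz:
        with tarfile.open(fileobj=gz, mode="r:") as tar:
            for member in tar.getmembers():
                if member.isfile():
                    manifest[member.name] = (member.offset_data, member.size)
    return manifest

def extract(tar_gz_bytes, manifest, name):
    """With a truly seekable stream this would be seek(offset) + read(size);
    gzip.decompress() plays that role here for illustration only."""
    offset, size = manifest[name]
    data = gzip.decompress(tar_gz_bytes)
    return data[offset:offset + size]
```

    Once the stream underneath is an indexed one (as in the PoC above), the `extract` step touches only the compressed data around the nearest seek point rather than the whole archive.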

    Don't forget to run

    pip install indexed_gzip
    

    or

    sudo apt install python3-indexed-gzip
    

    depending on your system

    Usage:

    python3 fast_tgz.py index large_archive.tar.gz
    python3 fast_tgz.py extract large_archive.tar.gz file_4.txt file_4.txt
    

    A sample large_archive.tar.gz is also attached for testing:

    large_archive.tar.gz (github.com)

  • unxed

    unxed - 2025-06-14

    Found a tool implementing exactly this logic:
    https://github.com/mxmlnkn/ratarmount


