dwarfs/thrift/metadata.thrift

/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author     Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright  Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the “Software”), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * SPDX-License-Identifier: MIT
 */

include "thrift/annotation/cpp.thrift"

namespace cpp2 dwarfs.thrift.metadata

// Unsigned integer aliases used throughout this schema.
//
// Each typedef pairs a Thrift integer type with a `@cpp.Type` annotation
// that names the unsigned C++ type of the same width to be used for it in
// the generated C++ code.
@cpp.Type{name = "uint8_t"}
typedef byte UInt8
@cpp.Type{name = "uint16_t"}
typedef i16 UInt16
@cpp.Type{name = "uint32_t"}
typedef i32 UInt32
@cpp.Type{name = "uint64_t"}
typedef i64 UInt64

/**
 * One chunk of data
 *
 * A single file inode can be composed of multiple chunks, e.g. because
 * segments can be reused or because a single file spans multiple blocks.
 * Chunks may be overlapping if there is identical data in different files.
 *
 * A chunk is really just a view onto an otherwise unstructured file system
 * block.
 */
struct chunk {
   1: UInt32 block        // number of the file system block the chunk views
   2: UInt32 offset       // offset of the chunk from start of block, in bytes
   3: UInt32 size         // size of the chunk, in bytes
}

/**
 * One directory
 *
 * This structure represents the links between directory entries.
 * The `parent_entry` references the parent directory's `dir_entry`.
 * The `first_entry` member, together with the `first_entry` member of
 * the next directory, can be used to access the entries contained in
 * the directory.
 *
 * The range of contained entries is:
 *
 *    dir_entries[directory[inode].first_entry]
 *    ..
 *    dir_entries[directory[inode + 1].first_entry - 1]
 *
 * Note that as of v2.3, directory entries can be stored "packed", in
 * which case only the `first_entry` fields are populated and stored
 * delta-compressed. The `first_entry` fields must be unpacked before
 * use, and the `parent_entry` and `self_entry` fields must be rebuilt
 * by traversing the `dir_entries` using the unpacked `first_entry`
 * fields.
 */
struct directory {
   // index into `dir_entries` of the parent directory's entry
   1: UInt32 parent_entry      // indexes into `dir_entries`

   // index into `dir_entries` of the first entry contained in this directory
   2: UInt32 first_entry       // indexes into `dir_entries`

  //==========================================================//
  // fields added with dwarfs-0.11.0, file system version 2.5 //
  //==========================================================//

   // NOTE(review): presumably the index of this directory's own entry in
   // `dir_entries` -- confirm against the reader implementation
   3: UInt32 self_entry        // indexes into `dir_entries`
}

/**
 * Inode Data
 *
 * This structure contains all necessary metadata for an inode, such as
 * its mode (i.e. permissions and inode type), its owner/group and its
 * timestamps.
 */
struct inode_data {
   // NOTE: field ids are not consecutive here because ids 1 and 3 belong
   //       to the deprecated fields at the end of this struct.

   // index into `metadata.modes[]`
   2: UInt32 mode_index

   // index into `metadata.uids[]`
   4: UInt32 owner_index

   // index into `metadata.gids[]`
   5: UInt32 group_index

   // atime relative to `metadata.timestamp_base`
   6: UInt64 atime_offset

   // mtime relative to `metadata.timestamp_base`
   7: UInt64 mtime_offset

   // ctime relative to `metadata.timestamp_base`
   8: UInt64 ctime_offset

   /**
    * ==================================================================
    * NOTE: These fields have been deprecated with filesystem version 2.3.
    *       They are still being used to read older filesystem versions.
    *       They do *not* occupy any space in version 2.3 and above.
    */

   // index into `metadata.names[]`
   1: UInt32 name_index_v2_2

   // inode number
   3: UInt32 inode_v2_2

   /* ==================================================================
    */
}

/**
 * A directory entry
 *
 * This structure represents a single directory entry and just combines
 * a name with an inode number. The inode number can then be used to
 * look up almost all other metadata.
 */
struct dir_entry {
   // name of the entry, as an index into `metadata.names[]`
   1: UInt32 name_index

   // inode number of the entry, i.e. an index into `metadata.inodes[]`
   2: UInt32 inode_num
}

/**
 * File system options
 */
struct fs_options {
   // file system contains only mtime time stamps
   1: bool   mtime_only

   // time base and offsets are stored with this resolution
   // 1 = seconds, 60 = minutes, 3600 = hours, ...
   2: optional UInt32 time_resolution_sec

   // `metadata.chunk_table` is stored delta-compressed and must be
   // unpacked before use
   3: bool   packed_chunk_table

   // `metadata.directories` is stored in packed format and must be
   // unpacked before use (see the `directory` struct)
   4: bool   packed_directories

   // `metadata.shared_files_table` is stored in packed format and must
   // be unpacked before use
   5: bool   packed_shared_files_table
}

/**
 * An (optionally packed) string table
 */
struct string_table {
   // raw buffer containing the concatenation of all individual,
   // potentially compressed, strings
   1: string buffer

   // symbol table for fsst compression; if fsst is not used, this
   // will not be set and `buffer` will contain uncompressed strings
   2: optional string symtab

   // the (optionally packed) index; if `packed_index` is set, the
   // index is stored delta-compressed
   // NOTE(review): presumably the index locates the individual strings
   // within `buffer` -- confirm against the reader implementation
   3: list<UInt32> index

   // indicates if `index` is packed
   4: bool packed_index
}

/**
 * Inode size cache
 *
 * For highly fragmented inodes, computing the size from the
 * individual chunks can be extremely slow. This cache can be
 * used to bypass the chunk lookup and size computation.
 */
struct inode_size_cache {
   // lookup from inode number to size
   1: map<UInt32, UInt64>  lookup

   // minimum number of chunks for a file to be found in the cache;
   // corresponds to scanner_options.inode_size_cache_min_chunk_count
   2: UInt64               min_chunk_count
}

/**
 * File System Metadata
 *
 * This is the root structure for all file system metadata.
 */
struct metadata {
   /**
    * Ranges of chunks that make up regular files. Identical
    * files share the same chunk range. The range of chunks
    * for a regular file is:
    *
    *   chunks[chunk_table[index]] .. chunks[chunk_table[index + 1] - 1]
    *
    * Here, `index` is either `inode - file_inode_offset` for
    * unique file inodes, or for shared file inodes:
    *
    *   shared_files[inode - file_inode_offset - unique_files] + unique_files
    *
    * Note that here `shared_files` is the unpacked version of
    * `shared_files_table`.
    */
   1: list<chunk>      chunks

   /**
    * All directories, indexed by inode number. There's one extra
    * sentinel directory at the end that has `first_entry` point to
    * the end of `dir_entries`, so directory entry lookups work the
    * same for all directories.
    *
    * Note that this list is stored in a packed format as of v2.3
    * if `options.packed_directories` is `true` and must be unpacked
    * before use. See the documentation for the `directory` struct.
    */
   2: list<directory>  directories

   /**
    * Inode metadata, indexed by inode number.
    *
    * Inodes are assigned strictly in the following order:
    *
    *   - directories, starting with the root dir at inode 0
    *   - symbolic links
    *   - unique regular files
    *   - shared regular files
    *   - character and block devices
    *   - named pipes and sockets
    *
    * The inode type can be determined from its mode, which makes
    * it possible to find the inode offsets for each distinct type
    * by a simple binary search. These inode offsets are required
    * to perform lookups into lists indexed by non-directory inode
    * numbers.
    *
    * The number of shared regular files can be determined from
    * `shared_files_table`.
    */
   3: list<inode_data> inodes

   /**
    * Chunk lookup table, indexed by `inode - file_inode_offset`.
    * There's one extra sentinel item at the end that points to the
    * end of `chunks`, so chunk lookups work the same for all inodes.
    *
    * Note that this list is stored delta-compressed as of v2.3
    * if `options.packed_chunk_table` is `true` and must be unpacked
    * before use.
    */
   4: list<UInt32>     chunk_table

   /**
    * =========================================================================
    * NOTE: This has been deprecated with filesystem version 2.3.
    *       It is still being used to read older filesystem versions.
    */
   5: list<UInt32>     entry_table_v2_2
   /* =========================================================================
    */

   // symlink lookup table, indexed by `inode - symlink_inode_offset`
   6: list<UInt32>     symlink_table

   // user ids, for lookup by `inode.owner_index`
   7: list<UInt32>     uids

   // group ids, for lookup by `inode.group_index`
   8: list<UInt32>     gids

   // inode modes, for lookup by `inode.mode_index`
   9: list<UInt32>     modes

   // directory entry names, for lookup by `dir_entry.name_index`
  10: list<string>     names

   // symlink targets, for lookup by index from `symlink_table`
  11: list<string>     symlinks

   // timestamp base for all inode timestamps
  12: UInt64           timestamp_base

  /************************ DEPRECATED **********************
   *
   * These are redundant and can be determined at run-time
   * with a simple binary search. Compatibility is not
   * affected.
   *
   *   13: UInt32          chunk_inode_offset;
   *   14: UInt32          link_inode_offset;
   *
   *********************************************************/

   // file system block size in bytes
  15: UInt32           block_size

   // total file system size in bytes
  16: UInt64           total_fs_size

  //=========================================================//
  // fields added with dwarfs-0.3.0, file system version 2.1 //
  //=========================================================//

   // device ids, for lookup by `inode - device_inode_offset`
  17: optional list<UInt64>     devices

   // file system options
  18: optional fs_options       options

  //=========================================================//
  // fields added with dwarfs-0.5.0, file system version 2.3 //
  //=========================================================//

   /**
    * All directory entries
    *
    * Starting with the root directory entry at index 0, this
    * list contains the ranges of all directory entries of the
    * file system. Along with `directories`, this allows traversal
    * of the full file system structure.
    *
    * The ranges of entries that belong to a single directory
    * are determined by `directory.first_entry`. Within a single
    * directory, entries are ordered asciibetically by name,
    * which makes it possible to efficiently find entries using
    * binary search.
    */
  19: optional list<dir_entry>  dir_entries

   /**
    * Shared files mapping
    *
    * Note that this list is stored in a packed format if
    * `options.packed_shared_files_table` is `true` and must be
    * unpacked before use.
    *
    * In packed format, it is stored as number of repetitions
    * per index, offset by 2 (the minimum number of repetitions),
    * so e.g. a packed list
    *
    *   [0, 3, 1, 0, 1]
    *
    * would unpack to:
    *
    *   [0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4]
    *
    * So the packed 5-element array provides mappings for 15 shared
    * file inodes. Assuming 10 unique files and a file inode offset
    * of 10, a regular file inode 25 would be a shared file inode,
    * and the index for lookup in `chunk_table` would be `10 + 1`.
    */
  20: optional list<UInt32>     shared_files_table

   // total size of hardlinked files beyond the first link, in bytes
  21: optional UInt64           total_hardlink_size

   // version string
  22: optional string           dwarfs_version

   // unix timestamp of metadata creation time
  23: optional UInt64           create_timestamp

   // names stored as an (optionally packed) string table
   // NOTE(review): presumably used in place of `names` when set -- confirm
  24: optional string_table     compact_names

   // symlink targets stored as an (optionally packed) string table
   // NOTE(review): presumably used in place of `symlinks` when set -- confirm
  25: optional string_table     compact_symlinks

  //=========================================================//
  // fields added with dwarfs-0.7.0, file system version 2.5 //
  //=========================================================//

   // preferred path separator of original file system
  26: optional UInt32           preferred_path_separator

  //=========================================================//
  // fields added with dwarfs-0.7.3, file system version 2.5 //
  //=========================================================//

  // We don't need to increment the file system minor version
  // as file systems created with this new version are still
  // readable by older binaries as long as they don't use any
  // unsupported features (e.g. FLAC compression).

  // The set of features used in this file system image. As long
  // as an older binary supports all features, it will be able
  // to use images created with newer versions. We use strings
  // here instead of an enum so older versions can still output
  // names of features used by a newer version.
  27: optional set<string>      features

  //=========================================================//
  // fields added with dwarfs-0.8.0, file system version 2.5 //
  //=========================================================//

  // The set of categories used in this file system image. Used
  // for displaying and to select compression algorithms when
  // recompressing the image.
  28: optional list<string>     category_names

  // The category of each block in the file system image. The
  // index into this vector is the block number and the value
  // is an index into `category_names`.
  29: optional list<UInt32>     block_categories

  //==========================================================//
  // fields added with dwarfs-0.11.0, file system version 2.5 //
  //==========================================================//

  // Size cache for highly fragmented file inodes
  30: optional inode_size_cache reg_file_size_cache
}