Layer 2: Storage

Single-purpose storage interfaces. Each store does one thing.

JournalStore

SQL statement buffer for write-ahead logging.

`ftm_lakehouse.storage.JournalStore = SqlJournalStore` `module-attribute`

ParquetStore

Delta Lake parquet storage for statements, partitioned by (shard, bucket, origin). Writes are append-only; physical cleanup happens in three independent async ops, all coordinated by a dataset-wide write fence: merge (deduplication, first_seen folding, and tombstone reaping), compact (bin-packing small files), and vacuum (deleting files the Delta log no longer references).

`ftm_lakehouse.storage.ParquetStore`

Bases: LakehouseApiMixin

Single Delta Lake table (per dataset) partitioned by (shard, bucket, origin).

Writes are append-only: `append` sorts a per-partition batch in memory and writes one parquet file. Reads dedupe on the fly via the deduped statement view registered on the `LakeStore` connection; `_query_statement_data` iterates (shard, bucket) partitions so the window function stays bounded per iteration. `merge`, `compact`, and `vacuum` provide physical cleanup but are no longer load-bearing for query correctness.

Source code in ftm_lakehouse/storage/parquet.py

class ParquetStore(LakehouseApiMixin):
    """Single Delta Lake table (per dataset) partitioned by ``(shard, bucket,
    origin)``.

    Writes are append-only: :meth:`append` sorts a per-partition batch in
    memory and writes one parquet file. Reads dedupe on the fly via the
    deduped ``statement`` view registered on the :class:`LakeStore`
    connection; :meth:`_query_statement_data` iterates ``(shard, bucket)``
    partitions so the window function stays bounded per iteration.
    :meth:`merge`, :meth:`compact`, :meth:`vacuum` provide physical
    cleanup but are no longer load-bearing for query correctness.
    """

    def __init__(self, uri: Uri, dataset: str, shards: int | None = None) -> None:
        self.uri = join_uri(uri, path.STATEMENTS)
        super().__init__(self.uri)
        self.settings = Settings()
        self.dataset = dataset
        self.shards = shards if shards is not None else DEFAULT_SHARDS
        # ``_store`` is rooted at the dataset URI (lock file, CSV exports);
        # ``_lake`` is rooted at the statements sub-path.
        self._store = get_store(uri)
        self._lake = LakeStore(
            uri=str(self.uri),
            dataset=self.dataset,
            partition_by=PARTITIONS,
            view_sqls={
                TABLE.name: dedupe_view_sql,
                TABLE_RAW.name: raw_view_sql,
            },
            duckdb_config=duckdb_config(),
        )
        self.log = get_logger(
            f"{self.dataset}.{self.__class__.__name__}",
            dataset=self.dataset,
            uri=mask_uri(self.uri),
        )
        setup_duckdb_storage()

    @property
    def deltatable(self) -> DeltaTable:
        return self._lake.deltatable

    @property
    def version(self) -> int | None:
        """Current version of the main Delta table (``None`` if it doesn't exist)."""
        if self._lake.exists:
            return self._lake.deltatable.version()
        return None

    @property
    def exists(self) -> bool:
        """Check existence of deltatable"""
        return self._lake.exists

    @no_api
    def view(self) -> LakeQueryView:
        """Get a view for querying statements."""
        return self._lake.default_view()

    @no_api
    def get(self, entity_id: str) -> StatementEntity | None:
        """Lookup an Entity by its ID; returns ``None`` when no statements match."""
        stmts = list(self.get_statements(entity_id))
        if stmts:
            return StatementEntity.from_statements(make_dataset(self.dataset), stmts)
        return None

    @no_api
    def query(self, q: Query | None = None) -> StatementEntities:
        """
        Query Entities from the store.

        Args:
            q: Optional Query object with filters

        Yields:
            StatementEntity objects matching the query
        """
        sql = (q or Query()).sql.statements
        for data in self._query_data(sql):
            yield data.to_entity()

    @no_api
    def query_statements(self, q: Select | None = None) -> Statements:
        """
        Query ordered Statements from the store.

        Args:
            q: Optional SQLAlchemy query (default: Query().sql.statements)

        Yields:
            Statement objects matching the query
        """
        for stmt_dict in self._query_statement_data(q):
            yield Statement.from_dict(stmt_dict)

    @no_api
    def get_statements(self, entity_id: str) -> Statements:
        """Query all live statements for a single entity.

        Scopes :meth:`_query_statement_data` iteration to the entity's
        own shard so single-entity lookups don't fan out to every
        ``(shard, bucket)`` pair.
        """
        if not self.exists:
            return
        shard = path.entity_shard(entity_id, self.shards)
        q = select(TABLE).where(TABLE.c.shard == shard, TABLE.c.entity_id == entity_id)
        for stmt_dict in self._query_statement_data(q, shard=shard):
            yield Statement.from_dict(stmt_dict)

    @no_api
    def stats(self) -> DatasetStats:
        """Compute statistics from the statement store.

        Runs ftmq's aggregation SQL through the deduped ``statement``
        view. Assumes an optimized store – run ``optimize`` (merge +
        compact + vacuum) before heavy stats workloads. Results are
        correct on an unoptimized store too; the dedupe window just
        makes the scan slower.
        """
        return self._lake.default_view().stats()

    def _write_lock(self) -> Lock:
        """Dataset-wide write fence.

        All Delta writers (``append``, ``merge``, ``compact``, ``vacuum``)
        acquire this lock so they can't race on the same partition. The lock
        lives at ``{dataset_root}/.LOCK`` per ``path.LOCK``.

        Acquisition is bounded by ``settings.lock_max_retries`` (total wait
        roughly ``N²/2`` seconds); entering the returned lock raises
        ``RuntimeError`` when the fence stays busy, so contended writers fail
        instead of pinning a thread forever. A lock left behind by a crashed
        writer must be released manually via :meth:`unlock`
        (``ftm-lakehouse operations unlock``).
        """
        return Lock(
            self._store, key=path.LOCK, max_retries=self.settings.lock_max_retries
        )

    @no_api
    def unlock(self) -> bool:
        """Forcibly release the dataset write fence.

        Operator escape hatch for the case where a writer process died
        with the lock held (or an attacker held it on purpose). The lock
        is just a file at ``{dataset_root}/.LOCK``; this method deletes
        it.

        **Use sparingly** – breaking a lock that's still held by a live
        writer can corrupt a write in flight. Confirm no process is
        actively writing before running.

        Returns:
            ``True`` if a lock was released, ``False`` if no lock was
            held.
        """
        if not self._store.exists(path.LOCK):
            return False
        self._store.delete(path.LOCK)
        return True

    @no_api
    def append(self, batch: pa.Table) -> None:
        """Append a sorted batch of statements.

        The batch should be scoped to a single ``shard`` for write efficiency
        (one parquet file per ``(shard, bucket, origin)`` partition). The
        method sorts by ``(bucket, origin, entity_id, id, last_seen DESC)``
        then splits by ``bucket`` so each ``write_deltalake`` call uses the
        bucket-appropriate ``writer_properties`` (small vs. large profile).
        Duplicates land as separate rows and are reaped by :meth:`merge`.

        Held under the dataset write fence so concurrent :meth:`merge` /
        :meth:`compact` / :meth:`vacuum` can't tombstone an in-flight append.

        Args:
            batch: PyArrow table with the columns of
                :data:`ftm_lakehouse.model.statement.SHARDED_SCHEMA`. Rows
                should already be scoped to a single shard.
        """
        if len(batch) == 0:
            return

        batch = batch.sort_by(
            [
                ("bucket", "ascending"),
                ("origin", "ascending"),
                ("entity_id", "ascending"),
                ("id", "ascending"),
                ("last_seen", "descending"),
            ]
        )
        with self._write_lock():
            # The very first write creates the table; everything else appends.
            mode = "append" if self.exists else "overwrite"
            for bucket in pc.unique(batch["bucket"]).to_pylist():
                sub = batch.filter(pc.equal(batch["bucket"], bucket))
                write_deltalake(
                    str(self.uri),
                    sub,
                    partition_by=PARTITIONS,
                    mode=mode,
                    writer_properties=writer_for_bucket(bucket),
                    storage_options=storage_options(),
                )
                # After the first sub-batch, the table exists for subsequent buckets.
                mode = "append"

    @staticmethod
    def _partition_predicate(shard: str, bucket: str, origin: str) -> str:
        """Build the SQL predicate selecting one ``(shard, bucket, origin)`` partition.

        The values are embedded as SQL string literals, so single quotes
        inside a value are doubled. Without that, a value containing ``'``
        (e.g. an origin like ``o'brien``) would produce a malformed
        predicate for the partition overwrite.
        """

        def literal(value: str) -> str:
            return "'" + str(value).replace("'", "''") + "'"

        return (
            f"shard = {literal(shard)} AND bucket = {literal(bucket)} "
            f"AND origin = {literal(origin)}"
        )

    @no_api
    def merge(self, grace_period_days: int | None = None) -> None:
        """Collapse duplicates and reap expired tombstones, partition by partition.

        For each ``(shard, bucket, origin)`` partition, runs the merge
        query against ``statement_raw`` (keep latest row per ``id`` by
        ``last_seen DESC``; fold ``first_seen`` to the min; drop
        tombstones older than the grace cutoff) and overwrites that
        partition via a ``predicate``-scoped ``write_deltalake`` call.
        Held under the dataset write fence (``path.LOCK``).

        Physical cleanup only – the deduped read-time view already
        produces the right query results without ``merge`` having run,
        so this is purely about reclaiming disk space and reaping
        tombstones past the grace window.

        Args:
            grace_period_days: Override ``settings.grace_period_days``. Pass
                ``0`` to drop tombstones immediately.
        """
        if not self.exists:
            return
        days = (
            grace_period_days
            if grace_period_days is not None
            else self.settings.grace_period_days
        )
        grace_cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        with self._write_lock():
            for shard, bucket, origin in self._list_partitions():
                merge_select = build_merge_query(shard, bucket, origin, grace_cutoff)
                sql = str(merge_select.compile(compile_kwargs={"literal_binds": True}))
                with self._lake.cursor() as cur:
                    # ``to_arrow_reader`` yields a pyarrow RecordBatchReader
                    # that DuckDB streams lazily from its execution
                    # pipeline; ``write_deltalake`` consumes the reader
                    # batch by batch, so the merge never materialises the
                    # full partition in Python memory.
                    reader = cur.execute(sql).to_arrow_reader()
                    write_deltalake(
                        str(self.uri),
                        reader,
                        mode="overwrite",
                        partition_by=PARTITIONS,
                        # Quote-escaped so partition values containing ``'``
                        # cannot break the overwrite predicate.
                        predicate=self._partition_predicate(shard, bucket, origin),
                        writer_properties=writer_for_bucket(bucket),
                        storage_options=storage_options(),
                    )

    @no_api
    def compact(self) -> None:
        """Bin-pack small parquet files within each partition.

        Cheap maintenance – Delta's ``OPTIMIZE compact`` only rewrites small
        files into larger ones; it does not collapse duplicate rows or drop
        tombstones (use :meth:`merge` for that). Held under the dataset write
        fence (``path.LOCK``).
        """
        if not self.exists:
            return
        with self._write_lock():
            for shard, bucket, origin in self._list_partitions():
                self.deltatable.optimize.compact(
                    partition_filters=[
                        ("shard", "=", shard),
                        ("bucket", "=", bucket),
                        ("origin", "=", origin),
                    ],
                    writer_properties=writer_for_bucket(bucket),
                )

    @no_api
    def vacuum(self, retention_hours: int = 0) -> None:
        """Delete obsolete parquet files no longer referenced by the Delta log.

        Tombstoned files (replaced by :meth:`merge` / :meth:`compact`) become
        orphans on disk; vacuum prunes them once they're past
        ``retention_hours``. Held under the dataset write fence
        (``path.LOCK``).

        Args:
            retention_hours: Keep files newer than this many hours. ``0``
                drops every file the Delta log no longer references.
        """
        if not self.exists:
            return
        with self._write_lock():
            self.deltatable.vacuum(
                retention_hours=retention_hours,
                dry_run=False,
                # Required so retention windows shorter than Delta's
                # configured minimum (including the default ``0``) are accepted.
                enforce_retention_duration=False,
            )

    @no_api
    def export_csv(self, key: str, q: Select | None = None) -> None:
        """Export statements to a sorted CSV file."""
        if not self.exists:
            return
        items = self._query_statement_data(q)
        with self._store.open(key, "w") as f:
            smart_write_csv(f, items)

    @no_api
    def get_changed_entity_ids(
        self,
        since: datetime,
        schemata: list[str] | None = None,
        prop: str | None = None,
    ) -> Iterator[str]:
        """Get entity IDs touched since a timestamp.

        Catches both *new* / *modified* statements (``first_seen >= since``)
        and *deleted* ones (``deleted_at >= since``) – the latter so the diff
        consumer can emit DEL ops for entities whose tombstone landed after
        the last diff state. Targets ``statement_raw`` because the deduped
        view filters tombstones; we need them visible here.

        Args:
            since: Lower bound (inclusive); truncated to whole seconds before
                comparison.
            schemata: Optional list of schema names to restrict to.
            prop: Optional property name to restrict to.

        Yields:
            Each matching entity ID once.
        """
        if not self.exists:
            return

        since_truncated = since.replace(microsecond=0)
        sql = (
            select(TABLE_RAW)
            .distinct(TABLE_RAW.c.entity_id)
            .where(
                or_(
                    TABLE_RAW.c.first_seen >= since_truncated,
                    TABLE_RAW.c.deleted_at >= since_truncated,
                )
            )
        )
        if schemata:
            sql = sql.where(TABLE_RAW.c.schema.in_(schemata))
        if prop:
            sql = sql.where(TABLE_RAW.c.prop == prop)
        # The query is scoped by shard only, while ``_iter_shard_buckets``
        # yields one pair per bucket. Collapse to unique shards (insertion
        # order preserved) so each shard is scanned once instead of once per
        # bucket.
        shards = dict.fromkeys(
            shard for shard, _bucket in self._iter_shard_buckets()
        )
        seen: set[str] = set()
        for shard in shards:
            scoped = sql.where(TABLE_RAW.c.shard == shard)
            for row in self._lake._execute(scoped):
                if row.entity_id not in seen:
                    seen.add(row.entity_id)
                    yield row.entity_id

    @no_api
    def destroy(self) -> None:
        """
        Destroy the deltalake by removing the transaction log in "_delta_log"
        directory. This is soft deleting, as the parquet files remain (but will
        be cleaned up on optimize --vacuum)
        """
        with Took() as t:
            self.log.warn("🔥 Destroying deltalake store ...")
            for key in self._lake._backend.iterate_keys("_delta_log"):
                self._lake._backend.delete(key)
        self.log.info("Deleted statement store.", took=t.took)

    def _list_partitions(self) -> list[tuple[str, str, str]]:
        """List all ``(shard, bucket, origin)`` triples currently in the table.

        Queries ``statement_raw`` so the enumeration scans the underlying
        Delta partitions directly without going through the deduped
        view's window function.
        """
        if not self.exists:
            return []
        with self._lake.cursor() as cur:
            rows = cur.execute(
                f"SELECT DISTINCT shard, bucket, origin FROM {TABLE_RAW.name} "
                "ORDER BY shard, bucket, origin"
            ).fetchall()
        return [(s, b, o) for s, b, o in rows]

    def _iter_shard_buckets(
        self, shard: str | None = None
    ) -> Iterator[tuple[str, str]]:
        """Yield unique ``(shard, bucket)`` pairs from existing partitions.

        Dedupe-aware reads (:meth:`_query_statement_data`) iterate per
        ``(shard, bucket)`` because entity IDs (and thus statement IDs)
        are uniquely placed in one ``(shard, bucket)`` by the model
        layer. Adding ``WHERE shard = ? AND bucket = ?`` to each
        iteration pushes through DuckDB's predicate pushdown to the
        deduped view's parquet scan, keeping the window function input
        bounded to one parquet file's worth of rows.

        Args:
            shard: Optional shard filter. When given, only ``(shard,
                bucket)`` pairs for that shard are yielded – lets
                single-entity lookups skip the other shards.
        """
        seen: set[tuple[str, str]] = set()
        for s, b, _origin in self._list_partitions():
            if shard is not None and s != shard:
                continue
            key = (s, b)
            if key not in seen:
                seen.add(key)
                yield s, b

    def _query_statement_data(
        self, q: Select | None = None, *, shard: str | None = None
    ) -> Iterator[StatementDict]:
        """Query statement dicts via dedupe-on-read, bypassing FtM construction.

        Iterates over ``(shard, bucket)`` partitions, adding ``WHERE
        shard = ? AND bucket = ?`` to each query so the deduped
        ``statement`` view's window function operates on one partition
        at a time (DuckDB pushes the predicates through to the parquet
        scan's File Filters). The live view is correct without running
        :meth:`merge`: each statement id surfaces at most once,
        carrying the earliest ``first_seen`` and the latest
        ``last_seen``; tombstones are filtered out post-dedupe so a
        re-add of a deleted entity still surfaces.

        Args:
            q: Optional SQLAlchemy select (default:
                ``Query().sql.statements``).
            shard: Optional shard filter passed through to
                :meth:`_iter_shard_buckets` to scope iteration to one
                shard – used by single-entity lookups.

        Yields:
            StatementDict instances.
        """
        if q is None:
            q = Query().sql.statements
        for s, b in self._iter_shard_buckets(shard=shard):
            scoped = q.where(column("shard") == s, column("bucket") == b)
            for row in self._lake._execute(scoped):
                yield StatementDict(**vars(row))

    def _query_data(self, q: Select | None = None) -> Iterator[EntityPayload]:
        """
        Query entity dicts via aggregate_unsafe(), bypassing FtM object construction.

        Args:
            q: Optional SQLAlchemy select (default: Query().sql.statements)

        Yields:
            EntityPayload instances
        """
        if not self.exists:
            return
        yield from aggregate_unsafe(self._query_statement_data(q), self.dataset)

`exists` `property`

Check existence of deltatable

`version` `property`

Current version of the main Delta table.

`append(batch)`

Append a sorted batch of statements.

The batch should be scoped to a single shard for write efficiency (one parquet file per (shard, bucket, origin) partition). The method sorts by (bucket, origin, entity_id, id, last_seen DESC) then splits by bucket so each write_deltalake call uses the bucket-appropriate writer_properties (small vs. large profile). Duplicates land as separate rows and are reaped by `merge`.

Held under the dataset write fence so concurrent `merge` / `compact` / `vacuum` can't tombstone an in-flight append.

Parameters:

Name	Type	Description	Default
`batch`	`Table`	PyArrow table with the columns of :data:`ftm_lakehouse.model.statement.SHARDED_SCHEMA`. Rows should already be scoped to a single shard.	required

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def append(self, batch: pa.Table) -> None:
    """Write a batch of statements to the Delta table, one write per bucket.

    Callers should pass rows belonging to a single ``shard`` so that each
    ``(shard, bucket, origin)`` partition receives one parquet file. Rows
    are ordered by ``(bucket, origin, entity_id, id, last_seen DESC)`` and
    then written bucket by bucket, because ``writer_properties`` differ
    per bucket (small vs. large profile). Nothing is deduplicated here;
    duplicate rows are collapsed later by :meth:`merge`.

    The whole write runs under the dataset write fence so a concurrent
    :meth:`merge` / :meth:`compact` / :meth:`vacuum` can't tombstone an
    in-flight append.

    Args:
        batch: PyArrow table with the columns of
            :data:`ftm_lakehouse.model.statement.SHARDED_SCHEMA`. Rows
            should already be scoped to a single shard.
    """
    if len(batch) == 0:
        return

    sort_keys = [
        ("bucket", "ascending"),
        ("origin", "ascending"),
        ("entity_id", "ascending"),
        ("id", "ascending"),
        ("last_seen", "descending"),
    ]
    ordered = batch.sort_by(sort_keys)
    with self._write_lock():
        table_exists = self.exists
        buckets = pc.unique(ordered["bucket"]).to_pylist()
        for position, bucket in enumerate(buckets):
            # Only the very first write against a missing table may create
            # it; every later sub-batch appends to the now-existing table.
            creates_table = position == 0 and not table_exists
            bucket_mask = pc.equal(ordered["bucket"], bucket)
            write_deltalake(
                str(self.uri),
                ordered.filter(bucket_mask),
                partition_by=PARTITIONS,
                mode="overwrite" if creates_table else "append",
                writer_properties=writer_for_bucket(bucket),
                storage_options=storage_options(),
            )

`compact()`

Bin-pack small parquet files within each partition.

Cheap maintenance – Delta's OPTIMIZE compact only rewrites small files into larger ones; it does not collapse duplicate rows or drop tombstones (use `merge` for that). Held under the dataset write fence (path.LOCK).

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def compact(self) -> None:
    """Rewrite each partition's small parquet files into larger ones.

    This is Delta's ``OPTIMIZE compact`` applied per ``(shard, bucket,
    origin)`` partition. It is cheap and purely physical: duplicate rows
    and tombstones are left untouched (that is :meth:`merge`'s job). Runs
    under the dataset write fence (``path.LOCK``).
    """
    if not self.exists:
        return
    with self._write_lock():
        for partition in self._list_partitions():
            _shard, bucket, _origin = partition
            # One equality filter per partition column, in partition order.
            filters = [
                (name, "=", value)
                for name, value in zip(("shard", "bucket", "origin"), partition)
            ]
            self.deltatable.optimize.compact(
                partition_filters=filters,
                writer_properties=writer_for_bucket(bucket),
            )

`destroy()`

Destroy the deltalake by removing the transaction log in "_delta_log" directory. This is soft deleting, as the parquet files remain (but will be cleaned up on optimize --vacuum)

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def destroy(self) -> None:
    """
    Drop the Delta table by deleting every key under its "_delta_log"
    transaction log. This is a soft delete: the parquet data files stay
    in place (but will be cleaned up on optimize --vacuum).
    """
    with Took() as timer:
        self.log.warn("🔥 Destroying deltalake store ...")
        log_keys = self._lake._backend.iterate_keys("_delta_log")
        for log_key in log_keys:
            self._lake._backend.delete(log_key)
    # Logged after the timing context has closed, so ``took`` is final.
    self.log.info("Deleted statement store.", took=timer.took)

`export_csv(key, q=None)`

Export statements to a sorted CSV file.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def export_csv(self, key: str, q: Select | None = None) -> None:
    """Write the statements matching ``q`` as CSV to ``key`` in the dataset store.

    Does nothing when the Delta table does not exist yet.
    """
    if not self.exists:
        return
    with self._store.open(key, "w") as handle:
        # The rows are a lazy iterator, streamed straight into the writer.
        smart_write_csv(handle, self._query_statement_data(q))

`get(entity_id)`

Lookup an Entity by its ID

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def get(self, entity_id: str) -> StatementEntity | None:
    """Assemble the entity with the given ID from its statements.

    Returns ``None`` when the store holds no statements for ``entity_id``.
    """
    statements = list(self.get_statements(entity_id))
    if not statements:
        return None
    dataset = make_dataset(self.dataset)
    return StatementEntity.from_statements(dataset, statements)

`get_changed_entity_ids(since, schemata=None, prop=None)`

Get entity IDs touched since a timestamp.

Catches both new / modified statements (first_seen >= since) and deleted ones (deleted_at >= since) – the latter so the diff consumer can emit DEL ops for entities whose tombstone landed after the last diff state. Targets statement_raw because the deduped view filters tombstones; we need them visible here.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def get_changed_entity_ids(
    self,
    since: datetime,
    schemata: list[str] | None = None,
    prop: str | None = None,
) -> Iterator[str]:
    """Get entity IDs touched since a timestamp.

    Catches both *new* / *modified* statements (``first_seen >= since``)
    and *deleted* ones (``deleted_at >= since``) – the latter so the diff
    consumer can emit DEL ops for entities whose tombstone landed after
    the last diff state. Targets ``statement_raw`` because the deduped
    view filters tombstones; we need them visible here.

    Args:
        since: Lower bound (inclusive); truncated to whole seconds before
            comparison.
        schemata: Optional list of schema names to restrict to.
        prop: Optional property name to restrict to.

    Yields:
        Each matching entity ID once.
    """
    if not self.exists:
        return

    since_truncated = since.replace(microsecond=0)
    sql = (
        select(TABLE_RAW)
        .distinct(TABLE_RAW.c.entity_id)
        .where(
            or_(
                TABLE_RAW.c.first_seen >= since_truncated,
                TABLE_RAW.c.deleted_at >= since_truncated,
            )
        )
    )
    if schemata:
        sql = sql.where(TABLE_RAW.c.schema.in_(schemata))
    if prop:
        sql = sql.where(TABLE_RAW.c.prop == prop)
    # The query is scoped by shard only, while ``_iter_shard_buckets``
    # yields one pair per bucket. Collapse to unique shards (insertion
    # order preserved) so each shard is scanned once instead of once per
    # bucket.
    shards = dict.fromkeys(
        shard for shard, _bucket in self._iter_shard_buckets()
    )
    seen: set[str] = set()
    for shard in shards:
        scoped = sql.where(TABLE_RAW.c.shard == shard)
        for row in self._lake._execute(scoped):
            if row.entity_id not in seen:
                seen.add(row.entity_id)
                yield row.entity_id

`get_statements(entity_id)`

Query all live statements for a single entity.

Scopes `_query_statement_data` iteration to the entity's own shard so single-entity lookups don't fan out to every (shard, bucket) pair.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def get_statements(self, entity_id: str) -> Statements:
    """Yield every live statement belonging to one entity.

    The entity's shard is derived from its ID and passed on to
    :meth:`_query_statement_data`, so the lookup only visits that shard's
    ``(shard, bucket)`` pairs instead of the whole table.
    """
    if not self.exists:
        return
    shard = path.entity_shard(entity_id, self.shards)
    conditions = (TABLE.c.shard == shard, TABLE.c.entity_id == entity_id)
    rows = self._query_statement_data(select(TABLE).where(*conditions), shard=shard)
    yield from map(Statement.from_dict, rows)

`merge(grace_period_days=None)`

Collapse duplicates and reap expired tombstones, partition by partition.

For each (shard, bucket, origin) partition, runs the merge query against statement_raw (keep latest row per id by last_seen DESC; fold first_seen to the min; drop tombstones older than the grace cutoff) and atomically overwrites that partition via a partition-scoped overwrite predicate. Held under the dataset write fence (path.LOCK).

Physical cleanup only – the deduped read-time view already produces the right query results without merge having run, so this is purely about reclaiming disk space and reaping tombstones past the grace window.

Parameters:

Name	Type	Description	Default
`grace_period_days`	`int \| None`	Override `settings.grace_period_days`. Pass `0` to drop tombstones immediately.	`None`

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def merge(self, grace_period_days: int | None = None) -> None:
    """Collapse duplicates and reap expired tombstones, partition by partition.

    For each ``(shard, bucket, origin)`` partition, runs the merge
    query against ``statement_raw`` (keep latest row per ``id`` by
    ``last_seen DESC``; fold ``first_seen`` to the min; drop
    tombstones older than the grace cutoff) and overwrites that
    partition via a ``predicate``-scoped ``write_deltalake`` call.
    Held under the dataset write fence (``path.LOCK``).

    Physical cleanup only – the deduped read-time view already
    produces the right query results without ``merge`` having run,
    so this is purely about reclaiming disk space and reaping
    tombstones past the grace window.

    Args:
        grace_period_days: Override ``settings.grace_period_days``. Pass
            ``0`` to drop tombstones immediately.
    """
    if not self.exists:
        return

    def literal(value: str) -> str:
        # Partition values are embedded as SQL string literals; doubling
        # single quotes keeps a value such as ``o'brien`` from producing a
        # malformed overwrite predicate.
        return "'" + str(value).replace("'", "''") + "'"

    days = (
        grace_period_days
        if grace_period_days is not None
        else self.settings.grace_period_days
    )
    grace_cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    with self._write_lock():
        for shard, bucket, origin in self._list_partitions():
            merge_select = build_merge_query(shard, bucket, origin, grace_cutoff)
            sql = str(merge_select.compile(compile_kwargs={"literal_binds": True}))
            with self._lake.cursor() as cur:
                # ``to_arrow_reader`` yields a pyarrow RecordBatchReader
                # that DuckDB streams lazily from its execution
                # pipeline; ``write_deltalake`` consumes the reader
                # batch by batch, so the merge never materialises the
                # full partition in Python memory.
                reader = cur.execute(sql).to_arrow_reader()
                write_deltalake(
                    str(self.uri),
                    reader,
                    mode="overwrite",
                    partition_by=PARTITIONS,
                    predicate=(
                        f"shard = {literal(shard)} AND bucket = {literal(bucket)} "
                        f"AND origin = {literal(origin)}"
                    ),
                    writer_properties=writer_for_bucket(bucket),
                    storage_options=storage_options(),
                )

`query(q=None)`

Query Entities from the store.

Parameters:

Name	Type	Description	Default
`q`	`Query \| None`	Optional Query object with filters	`None`

Yields:

Type	Description
`StatementEntities`	StatementEntity objects matching the query

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def query(self, q: Query | None = None) -> StatementEntities:
    """
    Stream entities from the store.

    Args:
        q: Optional Query object with filters; a default ``Query()`` is
            used when omitted.

    Yields:
        StatementEntity objects matching the query
    """
    effective_query = q or Query()
    payloads = self._query_data(effective_query.sql.statements)
    yield from (payload.to_entity() for payload in payloads)

`query_statements(q=None)`

Query ordered Statements from the store.

Parameters:

Name	Type	Description	Default
`q`	`Select \| None`	Optional SQLAlchemy query (default: Query().sql.statements)	`None`

Yields:

Type	Description
`Statements`	Statement objects matching the query

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def query_statements(self, q: Select | None = None) -> Statements:
    """
    Stream ordered statements from the store.

    Args:
        q: Optional SQLAlchemy query (default: Query().sql.statements)

    Yields:
        Statement objects matching the query
    """
    rows = self._query_statement_data(q)
    yield from map(Statement.from_dict, rows)

`stats()`

Compute statistics from the statement store.

Runs ftmq's aggregation SQL through the deduped statement view. Assumes an optimized store – run optimize (merge + compact + vacuum) before heavy stats workloads. Results are correct on an unoptimized store too; the dedupe window just makes the scan slower.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def stats(self) -> DatasetStats:
    """Return dataset statistics computed from the statement store.

    The aggregation runs through the deduped ``statement`` view, so the
    numbers are correct even on an unoptimized store. The dedupe window
    makes that scan slower, though, so run ``optimize`` (merge +
    compact + vacuum) before heavy stats workloads.
    """
    statement_view = self._lake.default_view()
    return statement_view.stats()

`unlock()`

Forcibly release the dataset write fence.

Operator escape hatch for the case where a writer process died with the lock held (or an attacker held it on purpose). The lock is just a file at {dataset_root}/.LOCK; this method deletes it.

Use sparingly – breaking a lock that's still held by a live writer can corrupt a write in flight. Confirm no process is actively writing before running.

Returns:

Type	Description
`bool`	`True` if a lock was released, `False` if no lock was held.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def unlock(self) -> bool:
    """Force-release the dataset write fence.

    The fence is a plain file at ``{dataset_root}/.LOCK``; releasing it
    means deleting that file. This is an operator escape hatch for a
    lock left behind by a dead writer process (or held on purpose by
    an attacker).

    **Use sparingly** – removing a lock that a live writer still holds
    can corrupt the write in flight. Make sure nothing is actively
    writing before calling this.

    Returns:
        ``True`` if a lock file existed and was deleted, ``False`` if
        there was no lock to release.
    """
    if self._store.exists(path.LOCK):
        self._store.delete(path.LOCK)
        return True
    return False

`vacuum(retention_hours=0)`

Delete obsolete parquet files no longer referenced by the Delta log.

Tombstoned files (replaced by `merge` / `compact`) become orphans on disk; vacuum prunes them once they're past `retention_hours`. Held under the dataset write fence (`path.LOCK`).

Parameters:

Name	Type	Description	Default
`retention_hours`	`int`	Keep files newer than this many hours. `0` drops every file the Delta log no longer references.	`0`

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def vacuum(self, retention_hours: int = 0) -> None:
    """Prune parquet files that the Delta log no longer references.

    Files tombstoned by :meth:`merge` / :meth:`compact` stay on disk as
    orphans until they are vacuumed; this removes the ones older than
    ``retention_hours``. The work runs while holding the dataset write
    fence (``path.LOCK``). Nothing happens when the table does not
    exist.

    Args:
        retention_hours: Keep files newer than this many hours. ``0``
            drops every file the Delta log no longer references.
    """
    if self.exists:
        with self._write_lock():
            table = self.deltatable
            # Retention enforcement is switched off so that a window
            # shorter than Delta's default minimum (including 0) is honoured.
            table.vacuum(
                retention_hours=retention_hours,
                dry_run=False,
                enforce_retention_duration=False,
            )

`view()`

Get a view for querying statements.

Source code in ftm_lakehouse/storage/parquet.py

@no_api
def view(self) -> LakeQueryView:
    """Return the lake's default view for querying statements."""
    lake = self._lake
    return lake.default_view()

TagStore

Key-value freshness tracking.

`ftm_lakehouse.storage.TagStore`

Bases: Tags

Key-value store for freshness tracking.

Tags are timestamps stored as key-value pairs, used to track when resources were last updated and determine if processing is needed.

Layout: tags/{tenant}/{key}

This store has the "tags/{tenant}" key prefix set, so clients must use relative paths from there.

Source code in ftm_lakehouse/storage/tags.py

class TagStore(AnyTags):
    """
    Freshness tracker backed by a key-value store of timestamps.

    Each tag maps a key to the moment a resource was last updated, so
    callers can decide whether dependent processing has to run again.

    Layout: tags/{tenant}/{key}

    The underlying store has the "tags/{tenant}" key prefix set, so
    clients must pass keys relative to that prefix.
    """

    # NOTE(review): this is a class-level assignment, not an annotation;
    # presumably intended as a type hint for the store attribute -- confirm.
    store = Store[datetime, Literal[False]]

    def __init__(self, uri: Uri, tenant: str | None = None) -> None:
        tag_uri = join_uri(uri, path.tag(tenant=tenant))
        super().__init__(get_store(tag_uri, raise_on_nonexist=False))

    def is_latest(self, key: str, dependencies: Iterable[str]) -> bool:
        """
        Tell whether a tag is strictly newer than every dependency tag.

        Dependencies without a stored timestamp are ignored. If ``key``
        itself has no timestamp, or none of the dependencies has one,
        the result is False.

        Args:
            key: Tag key to check
            dependencies: Tag keys that this key depends on

        Returns:
            True if key is newer than all dependencies, False otherwise
        """
        own_stamp = self.get(key)
        if own_stamp is None:
            return False
        dependency_stamps = []
        for dependency in dependencies:
            stamp = self.get(dependency)
            if stamp:
                dependency_stamps.append(ensure_utc(stamp))
        if not dependency_stamps:
            return False
        own_stamp = ensure_utc(own_stamp)
        for stamp in dependency_stamps:
            if not (own_stamp > stamp):
                return False
        return True

    def set(self, key: str, timestamp: datetime | None = None) -> datetime:
        """Store ``timestamp`` (or the current UTC time) under ``key`` and return it."""
        stamp = timestamp if timestamp else datetime.now(timezone.utc)
        self.put(key, stamp)
        return stamp

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        masked = mask_uri(self.store.uri)
        return f"<{cls_name}({masked})>"

`is_latest(key, dependencies)`

Check if the tag is more recent than all dependencies.

Parameters:

Name	Type	Description	Default
`key`	`str`	Tag key to check	required
`dependencies`	`Iterable[str]`	Tag keys that this key depends on	required

Returns:

Type	Description
`bool`	True if key is newer than all dependencies, False otherwise

Source code in ftm_lakehouse/storage/tags.py

def is_latest(self, key: str, dependencies: Iterable[str]) -> bool:
    """
    Tell whether a tag is strictly newer than every dependency tag.

    Dependencies without a stored timestamp are ignored. If ``key``
    itself has no timestamp, or none of the dependencies has one, the
    result is False.

    Args:
        key: Tag key to check
        dependencies: Tag keys that this key depends on

    Returns:
        True if key is newer than all dependencies, False otherwise
    """
    own_stamp = self.get(key)
    if own_stamp is None:
        return False
    dependency_stamps = []
    for dependency in dependencies:
        stamp = self.get(dependency)
        if stamp:
            dependency_stamps.append(ensure_utc(stamp))
    if not dependency_stamps:
        return False
    own_stamp = ensure_utc(own_stamp)
    for stamp in dependency_stamps:
        if not (own_stamp > stamp):
            return False
    return True

`set(key, timestamp=None)`

Set a tag to the given timestamp (or now, in UTC).

Source code in ftm_lakehouse/storage/tags.py

def set(self, key: str, timestamp: datetime | None = None) -> datetime:
    """Set a tag to the given timestamp (or now, in UTC)."""
    ts = timestamp or datetime.now(timezone.utc)
    self.put(key, ts)
    return ts

Layer 2: Storage

JournalStore

ftm_lakehouse.storage.JournalStore = SqlJournalStore module-attribute

ParquetStore

ftm_lakehouse.storage.ParquetStore

exists property

version property

append(batch)

compact()

destroy()

export_csv(key, q=None)

get(entity_id)

get_changed_entity_ids(since, schemata=None, prop=None)

get_statements(entity_id)

merge(grace_period_days=None)

query(q=None)

query_statements(q=None)

stats()

unlock()

vacuum(retention_hours=0)

view()

TagStore

ftm_lakehouse.storage.TagStore

is_latest(key, dependencies)

set(key, timestamp=None)

`ftm_lakehouse.storage.JournalStore = SqlJournalStore` `module-attribute`

`ftm_lakehouse.storage.ParquetStore`

`exists` `property`

`version` `property`

`append(batch)`

`compact()`

`destroy()`

`export_csv(key, q=None)`

`get(entity_id)`

`get_changed_entity_ids(since, schemata=None, prop=None)`

`get_statements(entity_id)`

`merge(grace_period_days=None)`

`query(q=None)`

`query_statements(q=None)`

`stats()`

`unlock()`

`vacuum(retention_hours=0)`

`view()`

`ftm_lakehouse.storage.TagStore`

`is_latest(key, dependencies)`

`set(key, timestamp=None)`