Layer 4: Operation

Multi-step workflow operations that coordinate across repositories.

Base Classes

`ftm_lakehouse.operation.base.DatasetJobOperation`

Bases: LakehouseApiMixin, Generic[DJ]

A (long-running) operation for a specific dataset. It checks its dependencies for freshness so that the operation can be skipped when its target is already up-to-date, and it touches its target tag after a successful run. The job result is stored after a successful run as well.

Repositories are resolved through the LRU-cached factories, so an operation shares its repository instances with every other path that addresses the same dataset.

Subclasses can either set class attributes target and dependencies, or override get_target() and get_dependencies() for dynamic values.

Source code in ftm_lakehouse/operation/base.py

class DatasetJobOperation(LakehouseApiMixin, Generic[DJ]):
    """
    A (long-running) operation for a specific dataset that updates tags and
    checks dependencies for freshness to be able to skip this operation. The job
    result is stored after a successful run.

    Repositories are resolved through the LRU-cached factories, so an
    operation shares its repository instances with every other path that
    addresses the same dataset.

    Subclasses can either set class attributes `target` and `dependencies`,
    or override `get_target()` and `get_dependencies()` for dynamic values.
    """

    target: str = ""  # tag that gets touched after successful run
    # NOTE: this default list lives on the class and is shared by every
    # operation that does not rebind it – treat it as read-only.
    dependencies: list[str] = []  # dependencies for freshness check
    _dataset: Dataset

    def __init__(self, job: DJ, uri: Uri | None = None) -> None:
        """Bind the job and resolve the repositories of its dataset.

        Args:
            job: The job model instance; ``job.dataset`` selects the
                repositories and ``job.log`` becomes ``self.log``.
            uri: Optional storage uri handed to the repository factories.
                When omitted, the uri of the archive repository is passed
                on to the parent initializer.
        """
        self.job = job
        self.log = job.log
        self.archive = get_archive(job.dataset, uri)
        self.entities = get_entities(job.dataset, uri)
        self.documents = get_documents(job.dataset, uri)
        self.jobs = get_jobs(job.dataset, job.__class__, uri)
        self.tags = get_tags(job.dataset, uri)
        self.versions = get_versions(job.dataset, uri)
        super().__init__(uri or self.archive.uri)

    @classmethod
    def from_job(cls, job: DJ, dataset: Dataset) -> Self:
        """Create an operation bound to ``dataset``.

        Args:
            job: The job model instance
            dataset: The Dataset – provides the storage uri and stays bound
                as ``_dataset`` for operations that need the full handle
                (e.g. ``make`` / the index export).

        Returns:
            Configured operation instance
        """
        instance = cls(job, uri=dataset.uri)
        instance._dataset = dataset
        return instance

    def get_target(self) -> str:
        """Return the target tag. Override for dynamic values."""
        return self.target

    def get_dependencies(self) -> list[str]:
        """Return the dependencies. Override for dynamic values.

        A fresh list is returned, so callers can modify the result without
        altering the ``dependencies`` list shared on the class.
        """
        return list(self.dependencies)

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Do the actual work of the operation. Subclasses must implement it.

        Args:
            run: The current job run context
            *args: Positional arguments passed through from ``run()``
            **kwargs: Keyword arguments passed through from ``run()``,
                including ``force``

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def _run_local(self, force: bool | None = False, *args, **kwargs) -> DJ:
        """Core run logic – orchestration + handle().

        Unless ``force`` is set, and only if both a target and dependencies
        are defined, the tags repository is asked whether the target is
        already up-to-date. If so the job is stopped and returned without
        calling ``handle()``.

        Args:
            force: Run regardless of the freshness check. Also forwarded to
                ``handle()`` as keyword argument.
            *args: Forwarded to ``handle()``.
            **kwargs: Forwarded to ``handle()``.

        Returns:
            The job – either the skipped one or the one of the finished run.
        """
        target = self.get_target()
        dependencies = self.get_dependencies()

        # Freshness check: needs a target AND dependencies to compare against
        if (
            not force
            and target
            and dependencies
            and self.tags.is_latest(target, dependencies)
        ):
            self.job.log.info(
                f"Already up-to-date: `{target}`, skipping ...",
                target=target,
                dependencies=dependencies,
            )
            self.job.stop()
            return self.job

        # Execute: Store target tag and job result on successful context leave
        with self.jobs.run(self.job) as run, self.tags.touch(target) as now:
            self.job.log.info(
                f"Start `{target}` ...",
                target=target,
                dependencies=dependencies,
                started=now,
            )
            self.handle(run, *args, force=force, **kwargs)
        self.log.info(
            f"Done `{target}`.",
            target=target,
            dependencies=dependencies,
            started=now,
            took=run.job.took,
            errors=run.job.errors,
        )
        return run.job

    @api_delegate("_api_run")
    def run(self, force: bool | None = False, *args, **kwargs) -> DJ:
        """Execute the handle function.

        Args:
            force: Run regardless of the freshness of the dependencies.
            *args: Forwarded to ``handle()``.
            **kwargs: Forwarded to ``handle()``.

        Returns:
            The resulting job
        """
        return self._run_local(force, *args, **kwargs)

    @require_api
    def _api_run(self, force: bool | None = False, *args, **kwargs) -> DJ:
        """Delegate run to remote api.

        Posts the serialized job to ``_api/operations`` (``force`` travels as
        query parameter) and rebuilds the job model from the json response.
        Extra ``args`` / ``kwargs`` are not transmitted.
        """
        url = self._api.make_url("_api/operations")
        res = self._api.make_request(
            url,
            "POST",
            params={"force": force},
            json=self.job.model_dump(mode="json"),
        )
        return self.job.__class__(**res.json())

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.job.dataset})>"

`from_job(job, dataset)` `classmethod`

Create an operation bound to dataset.

Parameters:

Name	Type	Description	Default
`job`	`DJ`	The job model instance	required
`dataset`	`Dataset`	The Dataset – provides the storage uri and stays bound as `_dataset` for operations that need the full handle (e.g. `make` / the index export).	required

Returns:

Type	Description
`Self`	Configured operation instance

Source code in ftm_lakehouse/operation/base.py

@classmethod
def from_job(cls, job: DJ, dataset: Dataset) -> Self:
    """Build an operation for ``job`` that stays attached to ``dataset``.

    Args:
        job: The job model instance
        dataset: The Dataset – its ``uri`` is used as storage uri, and the
            object itself is kept on the instance as ``_dataset`` for
            operations that need the full handle (e.g. ``make`` / the
            index export).

    Returns:
        Configured operation instance
    """
    operation = cls(job, uri=dataset.uri)
    operation._dataset = dataset
    return operation

`get_dependencies()`

Return the dependencies. Override for dynamic values.

Source code in ftm_lakehouse/operation/base.py

def get_dependencies(self) -> list[str]:
    """Return the dependencies. Override for dynamic values.

    A fresh list is returned, so callers can modify the result without
    altering the ``dependencies`` list shared on the class.
    """
    return list(self.dependencies)

`get_target()`

Return the target tag. Override for dynamic values.

Source code in ftm_lakehouse/operation/base.py

def get_target(self) -> str:
    """Return the tag that gets touched after a successful run.

    This default hands back the ``target`` attribute; override it when the
    tag has to be computed.
    """
    tag_name = self.target
    return tag_name

`run(force=False, *args, **kwargs)`

Execute the handle function. Pass `force` to run it regardless of the freshness of its dependencies.

Source code in ftm_lakehouse/operation/base.py

@api_delegate("_api_run")
def run(self, force: bool | None = False, *args, **kwargs) -> DJ:
    """Run the operation and return the resulting job.

    Args:
        force: Run regardless of the freshness of the dependencies.
        *args: Passed on to the local run logic.
        **kwargs: Passed on to the local run logic.
    """
    outcome = self._run_local(force, *args, **kwargs)
    return outcome

CrawlOperation

Batch file ingestion from a source location.

`ftm_lakehouse.operation.crawl.CrawlJob`

Bases: DatasetJobModel

Job model for crawl operations.

Tracks the state and configuration of a crawl job.

Attributes:

Name	Type	Description
`uri`	`Uri`	Source location URI to crawl
`prefix`	`str \| None`	Include only keys with this prefix
`exclude_prefix`	`str \| None`	Exclude keys with this prefix
`glob`	`str \| None`	Include only keys matching this glob pattern
`exclude_glob`	`str \| None`	Exclude keys matching this glob pattern

Source code in ftm_lakehouse/operation/crawl.py

class CrawlJob(DatasetJobModel):
    """
    Job model for crawl operations.

    Tracks the state and configuration of a crawl job.

    Attributes:
        uri: Source location URI to crawl
        prefix: Include only keys with this prefix
        exclude_prefix: Exclude keys with this prefix
        glob: Include only keys matching this glob pattern
        exclude_glob: Exclude keys matching this glob pattern
        make_entities: Also create entities from each archived file
            (off by default)
        existing: How to treat files that are already archived; defaults
            to ``HandleExistingMode.skip_path``, ``None`` never skips
    """

    uri: Uri
    prefix: str | None = None
    exclude_prefix: str | None = None
    glob: str | None = None
    exclude_glob: str | None = None
    make_entities: bool = False
    existing: HandleExistingMode | None = HandleExistingMode.skip_path

`ftm_lakehouse.operation.CrawlOperation`

Bases: DatasetJobOperation[CrawlJob]

Crawl workflow that archives files and creates entities.

Iterates through files in a source store, archives them to the file repository, and creates corresponding entities in the entities repository.

Example

# Crawl the PDF files of a source location into the dataset `my_dataset`.
from ftm_lakehouse.operation import CrawlOperation, CrawlJob

job = CrawlJob.make(
    uri="s3://bucket/documents",  # source location to crawl
    dataset="my_dataset",
    glob="*.pdf"  # include only keys matching this glob pattern
)
op = CrawlOperation(job=job)
result = op.run()  # returns the job model
print(f"Crawled {result.done} files")

Source code in ftm_lakehouse/operation/crawl.py

class CrawlOperation(DatasetJobOperation[CrawlJob]):
    """
    Crawl workflow that archives files and optionally creates entities.

    Walks the keys of a source store, stores each file in the archive
    repository and – when the job has ``make_entities`` set – adds the
    entities made from the archived file to the entities repository.

    Example:
        ```python
        from ftm_lakehouse.operation import CrawlOperation, CrawlJob

        job = CrawlJob.make(
            uri="s3://bucket/documents",
            dataset="my_dataset",
            glob="*.pdf"
        )
        op = CrawlOperation(job=job)
        result = op.run()
        print(f"Crawled {result.done} files")
        ```
    """

    target = tag.OP_CRAWL

    def __init__(self, *args, **kwargs) -> None:
        """Set up the operation and open the source store of the job."""
        super().__init__(*args, **kwargs)
        self.source = get_store(self.job.uri)
        if not self.source.is_http:
            return
        # http sources get a client timeout of 24 hours, any other
        # configured client kwargs are kept
        config = ensure_dict(self.source.backend_config)
        client_kwargs = dict(ensure_dict(config.get("client_kwargs")))
        client_kwargs["timeout"] = aiohttp.ClientTimeout(total=3600 * 24)
        config["client_kwargs"] = client_kwargs
        self.source.backend_config = config

    def get_uris(self) -> Generator[str, None, None]:
        """
        Generate the file uris to crawl.

        The prefix, exclude prefix and glob filters are handed to the source
        store; the exclude glob is matched here. Each yielded key increments
        the pending counter of the job and touches the job.

        Yields:
            File uris to be crawled
        """
        self.log.info(f"Crawling `{mask_uri(self.job.uri)}` ...")
        keys = self.source.iterate_keys(
            prefix=self.job.prefix,
            exclude_prefix=self.job.exclude_prefix,
            glob=self.job.glob,
        )
        for key in keys:
            excluded = self.job.exclude_glob and fnmatch(key, self.job.exclude_glob)
            if not excluded:
                self.job.pending += 1
                self.job.touch()
                yield key

    def handle_crawl(self, uri: str, run: JobRun[CrawlJob]) -> datetime:
        """
        Handle a single crawl task.

        Archives the file unless it is to be skipped. If the job has
        ``make_entities`` set, the entities made from the archived file are
        added as well. The done counter only counts archived files.

        Args:
            uri: File uri to crawl
            run: Current job run context

        Returns:
            Timestamp (UTC) taken when the task was started
        """
        started = datetime.now(timezone.utc)

        self.log.info(f"Crawling `{uri}` ...", source=mask_uri(self.source.uri))
        # a checksum is only computed up front for local sources
        checksum = (
            self.source.checksum(uri, algorithm=CHECKSUM_ALGORITHM)
            if self.source.is_local
            else None
        )
        if self._should_skip(uri, checksum):
            return started

        stored = self.archive.store(
            self.source.to_uri(uri),
            checksum=checksum,
            key=uri,
            origin=tag.CRAWL_ORIGIN,
        )
        if self.job.make_entities:
            self.entities.add_many(stored.make_entities(), tag.CRAWL_ORIGIN)
        run.job.done += 1
        return started

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Crawl all uris, saving the run state at every 1000th task."""
        for count, key in enumerate(self.get_uris(), start=1):
            if not count % 1000:
                self.log.info(
                    f"Handling task {count} ...",
                    pending=self.job.pending,
                    done=self.job.done,
                )
                run.save()
            self.handle_crawl(key, run)
            run.job.pending -= 1
            run.job.touch()
        if self.job.make_entities:
            self.entities.flush()

    def _should_skip(self, uri: Uri, checksum: str | None) -> bool:
        """Decide from the job's ``existing`` mode whether to skip a file.

        Without a mode, with ``overwrite`` or without a known checksum
        nothing is skipped. ``skip_checksum`` skips when the archive has the
        checksum, ``skip_path`` additionally requires an archived file with
        the same key.
        """
        mode = self.job.existing
        if mode is None or mode == HandleExistingMode.overwrite or checksum is None:
            return False
        if mode == HandleExistingMode.skip_checksum:
            return self.archive.exists(checksum)
        if mode == HandleExistingMode.skip_path and self.archive.exists(checksum):
            wanted = str(uri)
            return any(
                file.key == wanted for file in self.archive.get_all_files(checksum)
            )
        return False

    def _api_run(self, force: bool | None = False, *args, **kwargs) -> CrawlJob:
        """Crawl always runs locally – source files aren't on the API server."""
        job = self._run_local(force, *args, **kwargs)
        return job

`get_uris()`

Generate file uris to crawl.

Applies prefix, glob, and exclude filters to the source store.

Yields:

Type	Description
`str`	File uris to be crawled

Source code in ftm_lakehouse/operation/crawl.py

def get_uris(self) -> Generator[str, None, None]:
    """
    Generate the file uris to crawl.

    The prefix, exclude prefix and glob filters are handed to the source
    store; the exclude glob is matched here. Each yielded key increments
    the pending counter of the job and touches the job.

    Yields:
        File uris to be crawled
    """
    self.log.info(f"Crawling `{mask_uri(self.job.uri)}` ...")
    keys = self.source.iterate_keys(
        prefix=self.job.prefix,
        exclude_prefix=self.job.exclude_prefix,
        glob=self.job.glob,
    )
    for key in keys:
        excluded = self.job.exclude_glob and fnmatch(key, self.job.exclude_glob)
        if not excluded:
            self.job.pending += 1
            self.job.touch()
            yield key

`handle_crawl(uri, run)`

Handle a single crawl task.

Archives the file unless it is skipped as already existing and, when the job has `make_entities` set, creates the corresponding entities.

Parameters:

Name	Type	Description	Default
`uri`	`str`	File uri to crawl	required
`run`	`JobRun[CrawlJob]`	Current job run context	required

Returns:

Type	Description
`datetime`	Timestamp when the task was processed

Source code in ftm_lakehouse/operation/crawl.py

def handle_crawl(self, uri: str, run: JobRun[CrawlJob]) -> datetime:
    """
    Handle a single crawl task.

    Archives the file unless it is to be skipped. If the job has
    ``make_entities`` set, the entities made from the archived file are
    added as well. The done counter only counts archived files.

    Args:
        uri: File uri to crawl
        run: Current job run context

    Returns:
        Timestamp (UTC) taken when the task was started
    """
    started = datetime.now(timezone.utc)

    self.log.info(f"Crawling `{uri}` ...", source=mask_uri(self.source.uri))
    # a checksum is only computed up front for local sources
    checksum = (
        self.source.checksum(uri, algorithm=CHECKSUM_ALGORITHM)
        if self.source.is_local
        else None
    )
    if self._should_skip(uri, checksum):
        return started

    stored = self.archive.store(
        self.source.to_uri(uri),
        checksum=checksum,
        key=uri,
        origin=tag.CRAWL_ORIGIN,
    )
    if self.job.make_entities:
        self.entities.add_many(stored.make_entities(), tag.CRAWL_ORIGIN)
    run.job.done += 1
    return started

ExportOperation

One operation for all exports, dispatched by ExportKind: statements (exports/statements.csv), entities (entities.ftm.json), documents (exports/documents.csv), statistics (exports/statistics.json), index (index.json).

`ftm_lakehouse.operation.export.ExportKind`

Bases: StrEnum

The available dataset exports.

Source code in ftm_lakehouse/operation/export.py

class ExportKind(StrEnum):
    """The available dataset exports."""

    # NOTE: the member order is the iteration order of the enum
    statements = "statements"  # exports/statements.csv
    entities = "entities"  # entities.ftm.json
    documents = "documents"  # exports/documents.csv
    statistics = "statistics"  # exports/statistics.json
    # index.json
    index = "index"  # type: ignore[assignment]  # shadows str.index, fine for enums

`ftm_lakehouse.operation.export.ExportJob`

Bases: DatasetJobModel

Job model for all export kinds.

Source code in ftm_lakehouse/operation/export.py

class ExportJob(DatasetJobModel):
    """Job model for all export kinds."""

    kind: ExportKind
    make_diff: bool = True
    """Also export a delta diff file (``entities`` / ``documents`` kinds)."""
    public_url_prefix: HttpUrlStr | None = None
    """Override the public url prefix (``documents`` kind)."""

    def get_public_prefix(self) -> str | None:
        """Resolve the public url prefix for this job.

        The prefix set on the job wins. Otherwise the prefix from the
        settings is rendered with the dataset name.

        Returns:
            The prefix, or ``None`` if neither the job nor the settings
            define one
        """
        override = self.public_url_prefix
        if override:
            return override
        template = settings.public_url_prefix
        if not template:
            return None
        return render(template, {"dataset": self.dataset})

`make_diff = True` `class-attribute` `instance-attribute`

Also export a delta diff file (entities / documents kinds).

`public_url_prefix = None` `class-attribute` `instance-attribute`

Override the public url prefix (documents kind).

`ftm_lakehouse.operation.ExportOperation`

Bases: DatasetJobOperation[ExportJob]

Export the dataset, dispatched by `job.kind` via the `EXPORTS` table.

Checks if the journal needs to be flushed first. Skips if the last export is newer than the last statements update (per-kind freshness target / dependencies from the spec table).

Source code in ftm_lakehouse/operation/export.py

class ExportOperation(DatasetJobOperation[ExportJob]):
    """Export the dataset, dispatched by ``job.kind`` via :data:`EXPORTS`.

    Flushes the journal first if needed. The freshness target and the
    dependencies are taken per kind from the spec table, so an export is
    skipped when it is newer than what it depends on.
    """

    @property
    def spec(self) -> ExportSpec:
        """The export spec registered for the kind of this job."""
        kind = self.job.kind
        return EXPORTS[kind]

    def get_target(self) -> str:
        """Return the freshness target of the spec."""
        return self.spec.target

    def get_dependencies(self) -> list[str]:
        """Return the dependencies of the spec as a new list."""
        return [*self.spec.dependencies]

    def ensure_flush(self) -> bool:
        """Flush the journal if it has updates that are not flushed yet.

        Returns:
            Whether the statement store exists afterwards
        """
        journal_flushed = self.tags.is_latest(
            tag.JOURNAL_FLUSHED, [tag.JOURNAL_UPDATED]
        )
        if not journal_flushed:
            self.entities.flush()

        statements = self.entities._statements
        if statements.exists:
            return True
        self.log.info(
            "Statement store empty, skipping ...",
            uri=mask_uri(statements.uri),
        )
        return False

    def _get_fresh_statements_csv(self) -> str | None:
        """Return statements.csv URI if it's at least as fresh as the store.

        The statements export's freshness tag is its target key
        (``path.EXPORTS_STATEMENTS``), touched after a successful run.
        ``None`` is returned if the file is missing or outdated.
        """
        store = self.entities._store
        if store.exists(path.EXPORTS_STATEMENTS) and self.tags.is_latest(
            path.EXPORTS_STATEMENTS, [tag.STATEMENTS_UPDATED]
        ):
            return store.to_uri(path.EXPORTS_STATEMENTS)
        return None

    def handle(self, run: JobRun, *args: Any, **kwargs: Any) -> None:
        """Run the handler of the spec, after making sure the journal is flushed.

        Nothing is exported (and the done counter stays untouched) when the
        spec requires statements but the statement store does not exist.
        """
        statements_available = self.ensure_flush()
        missing_input = self.spec.requires_statements and not statements_available
        if not missing_input:
            self.spec.handler(self, run, **kwargs)
            run.job.done = 1

MappingOperation

Process CSV-to-entity mapping configurations.

`ftm_lakehouse.operation.mapping.MappingJob`

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/mapping.py

# Job model for mapping operations.
class MappingJob(DatasetJobModel):
    # content hash identifying the archived CSV file and its mapping config
    content_hash: str
    # NOTE(review): presumably the number of generated entities – the
    # MappingOperation counts into `done`, confirm where this is populated
    entities: int = 0

`ftm_lakehouse.operation.MappingOperation`

Bases: DatasetJobOperation[MappingJob]

Mapping workflow that transforms a CSV file into entities.

Processes a single archived CSV file (identified by content_hash) using its mapping configuration to generate FollowTheMoney entities, which are written to the entity repository.

Example

# Map one archived CSV file of the dataset `my_dataset` into entities.
from ftm_lakehouse.operation import MappingOperation, MappingJob

job = MappingJob.make(
    dataset="my_dataset",
    # content hash of the archived CSV file to process
    content_hash="5a6acf229ba576d9a40b09292595658bbb74ef56",
)
op = MappingOperation(job=job)
result = op.run()  # returns the job model
print(f"Generated {result.done} entities")

Source code in ftm_lakehouse/operation/mapping.py

class MappingOperation(DatasetJobOperation[MappingJob]):
    """
    Mapping workflow that transforms a CSV file into entities.

    Takes one archived CSV file (identified by content_hash) together with
    its mapping configuration, generates FollowTheMoney entities from it
    and writes them to the entity repository.

    Example:
        ```python
        from ftm_lakehouse.operation import MappingOperation, MappingJob

        job = MappingJob.make(
            dataset="my_dataset",
            content_hash="5a6acf229ba576d9a40b09292595658bbb74ef56",
        )
        op = MappingOperation(job=job)
        result = op.run()
        print(f"Generated {result.done} entities")
        ```
    """

    def __init__(self, *args, **kwargs) -> None:
        """Set up the operation and resolve the mappings repository."""
        super().__init__(*args, **kwargs)
        dataset, uri = self.job.dataset, self.archive.uri
        self.mappings = get_mappings(dataset, uri)

    def get_target(self) -> str:
        """Return the mapping tag for the content hash of the job."""
        content_hash = self.job.content_hash
        return tag.mapping_tag(content_hash)

    def get_dependencies(self) -> list[str]:
        """Return the mapping path for the content hash as only dependency."""
        mapping_path = path.mapping(self.job.content_hash)
        return [mapping_path]

    def handle(self, run: JobRun[MappingJob], *args, **kwargs) -> None:
        """
        Process the mapping configuration and store generated entities.

        Every generated entity is written under the mapping origin and
        counted in ``run.job.done``; the entities repository is flushed at
        the end. This method itself does not check for freshness.
        """
        content_hash = self.job.content_hash
        origin = mapping_origin(content_hash)
        mapping = self.mappings.get(content_hash)
        archived = self.archive.get_file(content_hash)
        with (
            self.archive.local_path(archived.checksum) as csv_path,
            self.entities.writer(origin=origin) as writer,
        ):
            for entity in map_entities(mapping, csv_path):
                writer.add_entity(entity)
                run.job.done += 1
        self.entities.flush()

`handle(run, *args, **kwargs)`

Process the mapping configuration and store generated entities.

The check whether the mapping output is already up-to-date relative to the mapping config (and the resulting skip) is done by `run()` before this method is called.

Source code in ftm_lakehouse/operation/mapping.py

def handle(self, run: JobRun[MappingJob], *args, **kwargs) -> None:
    """
    Process the mapping configuration and store generated entities.

    Every generated entity is written under the mapping origin and
    counted in ``run.job.done``; the entities repository is flushed at
    the end. This method itself does not check for freshness.
    """
    content_hash = self.job.content_hash
    origin = mapping_origin(content_hash)
    mapping = self.mappings.get(content_hash)
    archived = self.archive.get_file(content_hash)
    with (
        self.archive.local_path(archived.checksum) as csv_path,
        self.entities.writer(origin=origin) as writer,
    ):
        for entity in map_entities(mapping, csv_path):
            writer.add_entity(entity)
            run.job.done += 1
    self.entities.flush()

OptimizeOperation

Optimize the parquet statement store in one pass: merge (per-partition rewrite that collapses duplicates, folds first_seen to the min, drops tombstones older than the grace cutoff per LAKEHOUSE_GRACE_PERIOD_DAYS), compact (bin-pack small files) and vacuum (delete obsolete files). Each step acquires the dataset-wide write fence (.LOCK).

`ftm_lakehouse.operation.maintenance.OptimizeJob`

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/maintenance.py

# Job model for the optimize operation; its fields tune the vacuum and the
# merge step.
class OptimizeJob(DatasetJobModel):
    retention_hours: int = 0
    """Vacuum: retain obsolete files newer than this many hours."""
    grace_period_days: int | None = None
    """Merge: override ``LAKEHOUSE_GRACE_PERIOD_DAYS`` for tombstone reaping."""

`grace_period_days = None` `class-attribute` `instance-attribute`

Merge: override LAKEHOUSE_GRACE_PERIOD_DAYS for tombstone reaping.

`retention_hours = 0` `class-attribute` `instance-attribute`

Vacuum: retain obsolete files newer than this many hours.

`ftm_lakehouse.operation.OptimizeOperation`

Bases: DatasetJobOperation[OptimizeJob]

Optimize the parquet statement store: merge, compact, vacuum.

For each (shard, bucket, origin) partition: keep the most-recent row per statement id, fold first_seen down to the minimum, drop tombstones older than the grace period – then bin-pack small files and delete obsolete ones. Each step is held under the dataset write fence.

Source code in ftm_lakehouse/operation/maintenance.py

class OptimizeOperation(DatasetJobOperation[OptimizeJob]):
    """Optimize the parquet statement store: merge, compact, vacuum.

    For each ``(shard, bucket, origin)`` partition: keep the most-recent row
    per statement id, fold ``first_seen`` down to the minimum, drop tombstones
    older than the grace period – then bin-pack small files and delete
    obsolete ones. Each step is held under the dataset write fence.
    """

    target = tag.STATEMENTS_OPTIMIZED
    dependencies = [tag.STATEMENTS_UPDATED]

    def handle(self, run: JobRun[OptimizeJob], *args, **kwargs) -> None:
        """Run merge, compact and vacuum one after another.

        Each finished step increments the done counter; the run state is
        saved between the steps.
        """
        statements = self.entities._statements
        steps = (
            lambda: statements.merge(run.job.grace_period_days),
            lambda: statements.compact(),
            lambda: statements.vacuum(retention_hours=run.job.retention_hours),
        )
        last = len(steps)
        for number, step in enumerate(steps, start=1):
            step()
            run.job.done += 1
            if number < last:
                run.save()

MakeOperation

Full workflow: flush journal + all exports.

`ftm_lakehouse.operation.make.MakeJob`

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/make.py

# Job model for the make operation; adds no fields to the base job model.
class MakeJob(DatasetJobModel):
    pass

`ftm_lakehouse.operation.MakeOperation`

Bases: DatasetJobOperation[MakeJob]

Source code in ftm_lakehouse/operation/make.py

class MakeOperation(DatasetJobOperation[MakeJob]):
    """Full workflow: flush the journal, then run all exports.

    The operation needs the full dataset handle, so it has to be created
    via ``from_job(job, dataset)``.
    """

    target = tag.OP_MAKE
    dependencies = [tag.JOURNAL_UPDATED, tag.STATEMENTS_UPDATED]

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Flush the entities journal and run one export per ``ExportKind``.

        Args:
            run: Current job run context
            **kwargs: ``force`` is passed on to each export operation

        Raises:
            AttributeError: If no dataset is bound to this operation.
        """
        force = kwargs.get("force", False)
        ds = getattr(self, "_dataset", None)
        if ds is None:
            # `_dataset` is only set by `from_job`, fail with a clear hint
            raise AttributeError(
                f"{self.__class__.__name__} has no dataset bound, "
                "create it via `from_job(job, dataset)`"
            )
        ds.get_entities().flush()
        for kind in ExportKind:
            job = ExportJob.make(dataset=ds.name, kind=kind)
            ExportOperation.from_job(job, ds).run(force=force)
        run.job.done = 1

DownloadArchiveOperation

Export archive files to their original paths.

`ftm_lakehouse.operation.download.DownloadArchiveJob`

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/download.py

# Job model for the archive download operation.
class DownloadArchiveJob(DatasetJobModel):
    # uri of the store the files are downloaded to
    target: Uri
    # number of documents skipped because they already exist at the target
    skipped: int = 0

`ftm_lakehouse.operation.DownloadArchiveOperation`

Bases: DatasetJobOperation[DownloadArchiveJob]

Download the archive files to a target, storing them under readable paths taken from the exported documents.csv.

Source code in ftm_lakehouse/operation/download.py

class DownloadArchiveOperation(DatasetJobOperation[DownloadArchiveJob]):
    """
    Copy the archive files to a target store, using the relative paths of
    the documents from the exported documents.csv
    """

    target = tag.OP_DOWNLOAD_ARCHIVE
    dependencies = [path.EXPORTS_DOCUMENTS]

    def handle(self, run: JobRun[DownloadArchiveJob], *args, **kwargs) -> None:
        """Stream every document that is not yet at the target into it.

        Documents whose relative path already exists at the target are
        counted as skipped, all others as done.
        """
        destination = get_store(run.job.target)
        self.log.info(
            "Downloading archive ...",
            target=mask_uri(destination.uri),
            documents=mask_uri(self.documents.csv_uri),
        )
        for document in self.documents.stream():
            rel_path = document.relative_path
            if not destination.exists(rel_path):
                self.log.info(
                    f"Downloading `{rel_path}` ...",
                    checksum=document.checksum,
                    source=mask_uri(self.archive.uri),
                    target=mask_uri(destination.uri),
                )
                with destination.open(rel_path, "wb") as sink:
                    with self.archive.open(document.checksum) as src:
                        stream(src, sink, CHUNK_SIZE_LARGE)
                run.job.done += 1
            else:
                self.log.debug(
                    f"Skipping `{rel_path}`, already exists.",
                    checksum=document.checksum,
                    source=mask_uri(self.archive.uri),
                    target=mask_uri(destination.uri),
                )
                run.job.skipped += 1

Layer 4: Operation

Base Classes

ftm_lakehouse.operation.base.DatasetJobOperation

from_job(job, dataset) classmethod

get_dependencies()

get_target()

run(force=False, *args, **kwargs)

CrawlOperation

ftm_lakehouse.operation.crawl.CrawlJob

ftm_lakehouse.operation.CrawlOperation

get_uris()

handle_crawl(uri, run)

ExportOperation

ftm_lakehouse.operation.export.ExportKind

ftm_lakehouse.operation.export.ExportJob

make_diff = True class-attribute instance-attribute

public_url_prefix = None class-attribute instance-attribute

ftm_lakehouse.operation.ExportOperation

MappingOperation

ftm_lakehouse.operation.mapping.MappingJob

ftm_lakehouse.operation.MappingOperation

handle(run, *args, **kwargs)

OptimizeOperation

ftm_lakehouse.operation.maintenance.OptimizeJob

grace_period_days = None class-attribute instance-attribute

retention_hours = 0 class-attribute instance-attribute

ftm_lakehouse.operation.OptimizeOperation

MakeOperation

ftm_lakehouse.operation.make.MakeJob

ftm_lakehouse.operation.MakeOperation

DownloadArchiveOperation

ftm_lakehouse.operation.download.DownloadArchiveJob

ftm_lakehouse.operation.DownloadArchiveOperation

`ftm_lakehouse.operation.base.DatasetJobOperation`

`from_job(job, dataset)` `classmethod`

`get_dependencies()`

`get_target()`

`run(force=False, *args, **kwargs)`

`ftm_lakehouse.operation.crawl.CrawlJob`

`ftm_lakehouse.operation.CrawlOperation`

`get_uris()`

`handle_crawl(uri, run)`

`ftm_lakehouse.operation.export.ExportKind`

`ftm_lakehouse.operation.export.ExportJob`

`make_diff = True` `class-attribute` `instance-attribute`

`public_url_prefix = None` `class-attribute` `instance-attribute`

`ftm_lakehouse.operation.ExportOperation`

`ftm_lakehouse.operation.mapping.MappingJob`

`ftm_lakehouse.operation.MappingOperation`

`handle(run, *args, **kwargs)`

`ftm_lakehouse.operation.maintenance.OptimizeJob`

`grace_period_days = None` `class-attribute` `instance-attribute`

`retention_hours = 0` `class-attribute` `instance-attribute`

`ftm_lakehouse.operation.OptimizeOperation`

`ftm_lakehouse.operation.make.MakeJob`

`ftm_lakehouse.operation.MakeOperation`

`ftm_lakehouse.operation.download.DownloadArchiveJob`

`ftm_lakehouse.operation.DownloadArchiveOperation`