Skip to content

Layer 4: Operation

Multi-step workflow operations that coordinate across repositories.

Base Classes

ftm_lakehouse.operation.base.DatasetJobOperation

Bases: LakehouseApiMixin, Generic[DJ]

A (long-running) operation for a specific dataset that updates tags and checks dependencies for freshness to be able to skip this operation. The job result is stored after successful run.

Subclasses can either set class attributes target and dependencies, or override get_target() and get_dependencies() for dynamic values.

Source code in ftm_lakehouse/operation/base.py
class DatasetJobOperation(LakehouseApiMixin, Generic[DJ]):
    """
    A (long-running) operation for a specific dataset that updates tags and
    checks dependencies for freshness to be able to skip this operation. The job
    result is stored after successful run.

    Subclasses can either set class attributes `target` and `dependencies`,
    or override `get_target()` and `get_dependencies()` for dynamic values.
    """

    # NOTE: class-level list default is shared by subclasses that don't
    # override it; safe as long as it is never mutated in place.
    target: str = ""  # tag that gets touched after successful run
    dependencies: list[str] = []  # dependencies for freshness check
    _dataset: Dataset  # only populated when constructed via `from_job()`

    def __init__(
        self,
        job: DJ,
        archive: ArchiveRepository | None = None,
        entities: EntityRepository | None = None,
        documents: DocumentRepository | None = None,
        jobs: JobRepository | None = None,
        tags: TagStore | None = None,
        versions: VersionStore | None = None,
        uri: Uri | None = None,
    ) -> None:
        """Wire up all repositories, deriving defaults from the job's dataset
        (and the optional base `uri`) for any not explicitly given.

        Args:
            job: The job model this operation executes
            archive: File archive repository (default: derived from job.dataset)
            entities: Entity repository (default: derived from job.dataset)
            documents: Document repository (default: derived from job.dataset)
            jobs: Job repository scoped to the job's class (default: derived)
            tags: Tag store used for freshness checks (default: derived)
            versions: Version store (default: derived)
            uri: Optional base uri used when deriving default repositories
        """
        self.job = job
        self.log = job.log
        self.archive = archive or get_archive(job.dataset, uri)
        self.entities = entities or get_entities(job.dataset, uri)
        self.documents = documents or get_documents(job.dataset, uri)
        self.jobs = jobs or get_jobs(job.dataset, job.__class__, uri)
        self.tags = tags or get_tags(job.dataset, uri)
        self.versions = versions or get_versions(job.dataset, uri)
        super().__init__(uri or self.archive.uri)

    @classmethod
    def from_job(cls, job: DJ, dataset: Dataset) -> Self:
        """Create an operation instance from a job and Dataset.

        Args:
            job: The job model instance
            dataset: The Dataset providing repositories and storage

        Returns:
            Configured operation instance
        """
        instance = cls(
            job=job,
            archive=dataset.archive,
            entities=dataset.entities,
            documents=dataset.documents,
            jobs=get_jobs(dataset.name, job.__class__, dataset.uri),
            tags=dataset._tags,
            versions=dataset._versions,
        )
        # keep a reference so subclasses (e.g. ones that coordinate several
        # factories) can access the full dataset via `self._dataset`
        instance._dataset = dataset
        return instance

    def get_target(self) -> str:
        """Return the target tag. Override for dynamic values."""
        return self.target

    def get_dependencies(self) -> list[str]:
        """Return the dependencies. Override for dynamic values."""
        return self.dependencies

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        # subclasses implement the actual work here; called by `run()`
        raise NotImplementedError

    @api_delegate("_api_run")
    def run(self, force: bool | None = False, *args, **kwargs) -> DJ:
        """Execute the handle function, force to run it regardless of freshness
        dependencies"""
        target = self.get_target()
        dependencies = self.get_dependencies()

        # Skip entirely when the target tag is still fresher than all
        # dependencies (unless forced)
        if not force:
            if target and dependencies:
                if self.tags.is_latest(target, dependencies):
                    self.job.log.info(
                        f"Already up-to-date: `{target}`, skipping ...",
                        target=target,
                        dependencies=dependencies,
                    )
                    self.job.stop()
                    return self.job

        # Execute: Store target tag and job result on successful context leave
        with self.jobs.run(self.job) as run, self.tags.touch(target) as now:
            self.job.log.info(
                f"Start `{target}` ...",
                target=target,
                dependencies=dependencies,
                started=now,
            )
            _ = self.handle(run, *args, force=force, **kwargs)
        # `run` and `now` remain bound after the `with` block (Python scoping)
        self.log.info(
            f"Done `{target}`.",
            target=target,
            dependencies=dependencies,
            started=now,
            took=run.job.took,
            errors=run.job.errors,
        )
        # reload the persisted result; the jobs context stored it on success
        result = self.jobs.latest()
        if result is not None:
            return result
        raise RuntimeError("Result is `None`")

    @require_api
    def _api_run(self, force: bool | None = False, *args, **kwargs) -> DJ:
        """Delegate run to remote api"""
        url = self._api.make_url("_api/operations")
        res = self._api.make_request(
            url,
            "POST",
            params={"force": force},
            json=self.job.model_dump(mode="json"),
        )
        # rebuild the concrete job model from the api response
        return self.job.__class__(**res.json())

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.job.dataset})>"

from_job(job, dataset) classmethod

Create an operation instance from a job and Dataset.

Parameters:

Name Type Description Default
job DJ

The job model instance

required
dataset Dataset

The Dataset providing repositories and storage

required

Returns:

Type Description
Self

Configured operation instance

Source code in ftm_lakehouse/operation/base.py
@classmethod
def from_job(cls, job: DJ, dataset: Dataset) -> Self:
    """Create an operation instance from a job and Dataset.

    Args:
        job: The job model instance
        dataset: The Dataset providing repositories and storage

    Returns:
        Configured operation instance
    """
    instance = cls(
        job=job,
        archive=dataset.archive,
        entities=dataset.entities,
        documents=dataset.documents,
        jobs=get_jobs(dataset.name, job.__class__, dataset.uri),
        tags=dataset._tags,
        versions=dataset._versions,
    )
    # keep a reference so the operation can access the full dataset later
    instance._dataset = dataset
    return instance

get_dependencies()

Return the dependencies. Override for dynamic values.

Source code in ftm_lakehouse/operation/base.py
def get_dependencies(self) -> list[str]:
    """Return the freshness-check dependencies; override for dynamic values."""
    deps = self.dependencies
    return deps

get_target()

Return the target tag. Override for dynamic values.

Source code in ftm_lakehouse/operation/base.py
def get_target(self) -> str:
    """Return the target tag; override for dynamic values."""
    target_tag = self.target
    return target_tag

run(force=False, *args, **kwargs)

Execute the handle function, force to run it regardless of freshness dependencies

Source code in ftm_lakehouse/operation/base.py
@api_delegate("_api_run")
def run(self, force: bool | None = False, *args, **kwargs) -> DJ:
    """Execute the handle function, force to run it regardless of freshness
    dependencies"""
    target = self.get_target()
    dependencies = self.get_dependencies()

    # Skip entirely when the target tag is still fresher than all
    # dependencies (unless forced)
    if not force:
        if target and dependencies:
            if self.tags.is_latest(target, dependencies):
                self.job.log.info(
                    f"Already up-to-date: `{target}`, skipping ...",
                    target=target,
                    dependencies=dependencies,
                )
                self.job.stop()
                return self.job

    # Execute: Store target tag and job result on successful context leave
    with self.jobs.run(self.job) as run, self.tags.touch(target) as now:
        self.job.log.info(
            f"Start `{target}` ...",
            target=target,
            dependencies=dependencies,
            started=now,
        )
        _ = self.handle(run, *args, force=force, **kwargs)
    # `run` and `now` remain bound after the `with` block (Python scoping)
    self.log.info(
        f"Done `{target}`.",
        target=target,
        dependencies=dependencies,
        started=now,
        took=run.job.took,
        errors=run.job.errors,
    )
    # reload the persisted result; the jobs context stored it on success
    result = self.jobs.latest()
    if result is not None:
        return result
    raise RuntimeError("Result is `None`")

CrawlOperation

Batch file ingestion from a source location.

ftm_lakehouse.operation.crawl.CrawlJob

Bases: DatasetJobModel

Job model for crawl operations.

Tracks the state and configuration of a crawl job.

Attributes:

Name Type Description
uri Uri

Source location URI to crawl

prefix str | None

Include only keys with this prefix

exclude_prefix str | None

Exclude keys with this prefix

glob str | None

Include only keys matching this glob pattern

exclude_glob str | None

Exclude keys matching this glob pattern

Source code in ftm_lakehouse/operation/crawl.py
class CrawlJob(DatasetJobModel):
    """
    Job model for crawl operations.

    Tracks the state and configuration of a crawl job.

    Attributes:
        uri: Source location URI to crawl
        prefix: Include only keys with this prefix
        exclude_prefix: Exclude keys with this prefix
        glob: Include only keys matching this glob pattern
        exclude_glob: Exclude keys matching this glob pattern
        make_entities: Also create entities for archived files
        existing: How to handle files already present in the archive
            (default: skip when the same checksum exists under the same path)
    """

    uri: Uri
    prefix: str | None = None
    exclude_prefix: str | None = None
    glob: str | None = None
    exclude_glob: str | None = None
    make_entities: bool = False
    existing: HandleExistingMode | None = HandleExistingMode.skip_path

ftm_lakehouse.operation.CrawlOperation

Bases: DatasetJobOperation[CrawlJob]

Crawl workflow that archives files and creates entities.

Iterates through files in a source store, archives them to the file repository, and creates corresponding entities in the entities repository.

Example
from ftm_lakehouse.operation import CrawlOperation, CrawlJob

job = CrawlJob.make(
    uri="s3://bucket/documents",
    dataset="my_dataset",
    glob="*.pdf"
)
op = CrawlOperation(job=job)
result = op.run()
print(f"Crawled {result.done} files")
Source code in ftm_lakehouse/operation/crawl.py
class CrawlOperation(DatasetJobOperation[CrawlJob]):
    """
    Crawl workflow that archives files and creates entities.

    Iterates through files in a source store, archives them to the
    file repository, and creates corresponding entities in the
    entities repository.

    Example:
        ```python
        from ftm_lakehouse.operation import CrawlOperation, CrawlJob

        job = CrawlJob.make(
            uri="s3://bucket/documents",
            dataset="my_dataset",
            glob="*.pdf"
        )
        op = CrawlOperation(job=job)
        result = op.run()
        print(f"Crawled {result.done} files")
        ```
    """

    target = tag.OP_CRAWL

    def __init__(self, *args, **kwargs) -> None:
        """Set up the source store; for http sources, raise the client
        timeout to 24 hours to survive long downloads."""
        super().__init__(*args, **kwargs)
        self.source = get_store(self.job.uri)
        if self.source.is_http:
            # merge a generous timeout into any existing client kwargs
            backend_config = ensure_dict(self.source.backend_config)
            backend_config["client_kwargs"] = {
                **ensure_dict(backend_config.get("client_kwargs")),
                "timeout": aiohttp.ClientTimeout(total=3600 * 24),
            }
            self.source.backend_config = backend_config

    def get_uris(self) -> Generator[str, None, None]:
        """
        Generate file uris to crawl.

        Applies prefix, glob, and exclude filters to the source store.

        Yields:
            File uris to be crawled
        """
        self.log.info(f"Crawling `{mask_uri(self.job.uri)}` ...")
        for key in self.source.iterate_keys(
            prefix=self.job.prefix,
            exclude_prefix=self.job.exclude_prefix,
            glob=self.job.glob,
        ):
            # client-side filter: keys matching exclude_glob are skipped
            if self.job.exclude_glob and fnmatch(key, self.job.exclude_glob):
                continue
            # track discovery progress on the job model
            self.job.pending += 1
            self.job.touch()
            yield key

    def handle_crawl(self, uri: str, run: JobRun[CrawlJob]) -> datetime:
        """
        Handle a single crawl task.

        Archives the file and creates a corresponding entity.

        Args:
            uri: File uri to crawl
            run: Current job run context

        Returns:
            Timestamp when the task was processed
        """
        now = datetime.now()

        self.log.info(f"Crawling `{uri}` ...", source=mask_uri(self.source.uri))
        checksum = None
        # checksums are only precomputed for local sources; for remote ones
        # `_should_skip` cannot compare and will never skip
        if self.source.is_local:
            checksum = self.source.checksum(uri, algorithm=CHECKSUM_ALGORITHM)
        if not self._should_skip(uri, checksum):
            file = self.archive.store(
                self.source.to_uri(uri),
                checksum=checksum,
                key=uri,
                origin=tag.CRAWL_ORIGIN,
            )
            if self.job.make_entities:
                self.entities.add_many(file.make_entities(), tag.CRAWL_ORIGIN)
            run.job.done += 1
        return now

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Crawl every matching uri, persisting progress every 1000 tasks."""
        for ix, task in enumerate(self.get_uris(), 1):
            if ix % 1000 == 0:
                # periodic progress log + checkpoint for long-running crawls
                self.log.info(
                    f"Handling task {ix} ...",
                    pending=self.job.pending,
                    done=self.job.done,
                )
                run.save()
            self.handle_crawl(task, run)
            run.job.pending -= 1
            run.job.touch()
        if self.job.make_entities:
            self.entities.flush()

    def _should_skip(self, uri: Uri, checksum: str | None) -> bool:
        """Decide whether to skip archiving, based on the job's `existing`
        mode. Never skips when no mode is set, mode is `overwrite`, or no
        checksum is available. `skip_path` additionally requires the same
        key to be registered for the checksum."""
        if self.job.existing is None:
            return False
        if self.job.existing == HandleExistingMode.overwrite:
            return False
        if checksum is None:
            return False
        if self.job.existing == HandleExistingMode.skip_checksum:
            return self.archive.exists(checksum)
        if self.job.existing == HandleExistingMode.skip_path:
            if self.archive.exists(checksum):
                for file in self.archive.get_all_files(checksum):
                    if file.key == str(uri):
                        return True
        return False

get_uris()

Generate file uris to crawl.

Applies prefix, glob, and exclude filters to the source store.

Yields:

Type Description
str

File uris to be crawled

Source code in ftm_lakehouse/operation/crawl.py
def get_uris(self) -> Generator[str, None, None]:
    """
    Generate file uris to crawl.

    Applies prefix, glob, and exclude filters to the source store.

    Yields:
        File uris to be crawled
    """
    self.log.info(f"Crawling `{mask_uri(self.job.uri)}` ...")
    for key in self.source.iterate_keys(
        prefix=self.job.prefix,
        exclude_prefix=self.job.exclude_prefix,
        glob=self.job.glob,
    ):
        # client-side filter: keys matching exclude_glob are skipped
        if self.job.exclude_glob and fnmatch(key, self.job.exclude_glob):
            continue
        # track discovery progress on the job model
        self.job.pending += 1
        self.job.touch()
        yield key

handle_crawl(uri, run)

Handle a single crawl task.

Archives the file and creates a corresponding entity.

Parameters:

Name Type Description Default
uri str

File uri to crawl

required
run JobRun[CrawlJob]

Current job run context

required

Returns:

Type Description
datetime

Timestamp when the task was processed

Source code in ftm_lakehouse/operation/crawl.py
def handle_crawl(self, uri: str, run: JobRun[CrawlJob]) -> datetime:
    """
    Handle a single crawl task.

    Archives the file and creates a corresponding entity.

    Args:
        uri: File uri to crawl
        run: Current job run context

    Returns:
        Timestamp when the task was processed
    """
    now = datetime.now()

    self.log.info(f"Crawling `{uri}` ...", source=mask_uri(self.source.uri))
    checksum = None
    # checksums are only precomputed for local sources; for remote ones
    # `_should_skip` cannot compare and will never skip
    if self.source.is_local:
        checksum = self.source.checksum(uri, algorithm=CHECKSUM_ALGORITHM)
    if not self._should_skip(uri, checksum):
        file = self.archive.store(
            self.source.to_uri(uri),
            checksum=checksum,
            key=uri,
            origin=tag.CRAWL_ORIGIN,
        )
        if self.job.make_entities:
            self.entities.add_many(file.make_entities(), tag.CRAWL_ORIGIN)
        run.job.done += 1
    return now

Export Operations

ExportStatementsOperation

Export parquet store to exports/statements.csv.

ftm_lakehouse.operation.export.ExportStatementsJob

Bases: BaseExportJob

Source code in ftm_lakehouse/operation/export.py
class ExportStatementsJob(BaseExportJob):
    """Job model for the statements csv export."""

    target: str = path.EXPORTS_STATEMENTS

ftm_lakehouse.operation.ExportStatementsOperation

Bases: BaseExportOperation[ExportStatementsJob]

Export parquet store to statements.csv. Checks if journal needs to be flushed first. Skips if the last export is newer than the last statements update.

Source code in ftm_lakehouse/operation/export.py
class ExportStatementsOperation(BaseExportOperation[ExportStatementsJob]):
    """Export parquet store to statements.csv. Checks if journal needs to be
    flushed first. Skips if the last export is newer than the last statements
    update."""

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Flush the journal if needed, then write the statements csv export."""
        # ensure_flush() gates the export — semantics defined in the base class
        if self.ensure_flush():
            output_uri = self.entities._store.to_uri(path.EXPORTS_STATEMENTS)
            self.entities._store.ensure_parent(path.EXPORTS_STATEMENTS)
            self.entities._statements.export_csv(output_uri)
            run.job.done = 1

ExportEntitiesOperation

Export parquet store to entities.ftm.json.

ftm_lakehouse.operation.export.ExportEntitiesJob

Bases: BaseExportJob

Source code in ftm_lakehouse/operation/export.py
class ExportEntitiesJob(BaseExportJob):
    """Job model for the entities json export."""

    target: str = path.ENTITIES_JSON
    make_diff: bool = True  # also export a diff against the previous export

ftm_lakehouse.operation.ExportEntitiesOperation

Bases: BaseExportOperation[ExportEntitiesJob]

Export parquet store to entities.ftm.json. Checks if journal needs to be flushed first. Skips if the last export is newer than the last statements update.

Source code in ftm_lakehouse/operation/export.py
class ExportEntitiesOperation(BaseExportOperation[ExportEntitiesJob]):
    """Export parquet store to entities.ftm.json. Checks if journal needs to be
    flushed first. Skips if the last export is newer than the last statements
    update."""

    def handle(self, run: JobRun[ExportEntitiesJob], *args, **kwargs) -> None:
        """Export the entities json and, optionally, a diff export."""
        if self.ensure_flush():
            output_uri = self.entities._store.to_uri(path.ENTITIES_JSON)
            self.entities.export_entities(output_uri)
            if run.job.make_diff:
                self.entities.export_diff()
            run.job.done = 1

ExportStatisticsOperation

Export statistics to exports/statistics.json.

ftm_lakehouse.operation.export.ExportStatisticsJob

Bases: BaseExportJob

Source code in ftm_lakehouse/operation/export.py
class ExportStatisticsJob(BaseExportJob):
    """Job model for the statistics json export."""

    target: str = path.EXPORTS_STATISTICS

ftm_lakehouse.operation.ExportStatisticsOperation

Bases: BaseExportOperation[ExportStatisticsJob]

Export parquet store statistics to statistics.json. Checks if journal needs to be flushed first. Skips if the last export is newer than the last statements update.

Source code in ftm_lakehouse/operation/export.py
class ExportStatisticsOperation(BaseExportOperation[ExportStatisticsJob]):
    """Export parquet store statistics to statistics.json. Checks if journal
    needs to be flushed first. Skips if the last export is newer than the last
    statements update."""

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Compute statistics and store them as a new statistics.json version."""
        if self.ensure_flush():
            stats = self.entities.get_statistics()
            self.versions.make(path.EXPORTS_STATISTICS, stats)
            run.job.done = 1

ExportDocumentsOperation

Export document metadata to exports/documents.csv.

ftm_lakehouse.operation.export.ExportDocumentsJob

Bases: BaseExportJob

Source code in ftm_lakehouse/operation/export.py
class ExportDocumentsJob(BaseExportJob):
    """Job model for the documents csv export.

    Attributes:
        target: Export path for documents.csv
        make_diff: Also export a diff against the previous export
        public_url_prefix: Explicit public url prefix for document links
    """

    target: str = path.EXPORTS_DOCUMENTS
    make_diff: bool = True
    public_url_prefix: HttpUrlStr | None = None

    def get_public_prefix(self) -> str | None:
        """Return the public url prefix for exported document links.

        The job-level `public_url_prefix` wins over the global settings
        template. Returns None when neither is configured.
        """
        if self.public_url_prefix:
            return self.public_url_prefix
        if settings.public_url_prefix:
            return render(settings.public_url_prefix, {"dataset": self.dataset})
        # explicit: no prefix configured (was an implicit None fall-through)
        return None

ftm_lakehouse.operation.ExportDocumentsOperation

Bases: BaseExportOperation[ExportDocumentsJob]

Export file metadata to documents.csv. Checks if journal needs to be flushed first. Skips if the last export is newer than the last statements update.

Source code in ftm_lakehouse/operation/export.py
class ExportDocumentsOperation(BaseExportOperation[ExportDocumentsJob]):
    """Export file metadata to documents.csv. Checks if journal needs to be
    flushed first. Skips if the last export is newer than the last statements
    update."""

    def handle(self, run: JobRun[ExportDocumentsJob], *args, **kwargs) -> None:
        """Export documents csv (and optional diff), applying the public
        url prefix resolved from job or settings."""
        if self.ensure_flush():
            public_prefix = run.job.get_public_prefix()
            self.documents.export_csv(public_prefix)
            if run.job.make_diff:
                self.documents.export_diff(public_url_prefix=public_prefix)
            run.job.done = 1

ExportIndexOperation

Export index.json with optional resources.

ftm_lakehouse.operation.export.ExportIndexJob

Bases: BaseExportJob

Source code in ftm_lakehouse/operation/export.py
class ExportIndexJob(BaseExportJob):
    """Job model for the index.json export.

    Depends on the other export artifacts, so the index is only considered
    fresh relative to them.
    """

    target: str = path.INDEX
    dependencies: list[str] = [
        path.CONFIG,
        path.EXPORTS_STATISTICS,
        path.ENTITIES_JSON,
        path.EXPORTS_DOCUMENTS,
    ]

ftm_lakehouse.operation.ExportIndexOperation

Bases: BaseExportOperation[ExportIndexJob]

Export index.json, optionally including resources; these export targets must therefore already exist.

Source code in ftm_lakehouse/operation/export.py
class ExportIndexOperation(BaseExportOperation[ExportIndexJob]):
    """Export index.json, optionally including resources; these export
    targets must therefore already exist."""

    def handle(
        self,
        run: JobRun[ExportIndexJob],
        dataset: DatasetModel | None = None,
        *args,
        **kwargs,
    ) -> None:
        """Build and store index.json for the dataset.

        Appends a resource entry for every export artifact that exists in
        the dataset store, and applies dataset statistics when available.

        Args:
            run: Current job run context
            dataset: Dataset model to patch; a stub is created when omitted
        """
        self.ensure_flush()

        if dataset is None:
            # we need a stub dataset to patch
            dataset = make_dataset(run.job.dataset, DatasetModel, uri=self.versions.uri)
        public_prefix = dataset.get_public_prefix()

        if public_prefix:
            store = get_store(dataset.uri)
            # (artifact path, resource factory) pairs, in the order the
            # resources should appear on the dataset
            exports = [
                (path.EXPORTS_STATEMENTS, make_statements_resource),
                (path.ENTITIES_JSON, make_entities_resource),
                (path.EXPORTS_DOCUMENTS, make_documents_resource),
                (path.EXPORTS_STATISTICS, make_statistics_resource),
            ]
            for artifact, make_resource in exports:
                if store.exists(artifact):
                    uri = join_uri(dataset.uri, artifact)
                    public_url = join_uri(public_prefix, artifact)
                    dataset.resources.append(make_resource(uri, public_url))
                    if artifact == path.EXPORTS_STATISTICS:
                        # statistics additionally patch dataset-level stats
                        dataset.apply_stats(
                            store.get(path.EXPORTS_STATISTICS, model=DatasetStats)
                        )

        self.versions.make(path.INDEX, dataset)

        run.job.done = 1

MappingOperation

Process CSV-to-entity mapping configurations.

ftm_lakehouse.operation.mapping.MappingJob

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/mapping.py
class MappingJob(DatasetJobModel):
    """Job model for a csv-to-entities mapping operation."""

    content_hash: str  # checksum of the archived csv file to process
    # NOTE(review): presumably the generated-entity count; not updated in
    # the code visible here — confirm against callers
    entities: int = 0

ftm_lakehouse.operation.MappingOperation

Bases: DatasetJobOperation[MappingJob]

Mapping workflow that transforms a CSV file into entities.

Processes a single archived CSV file (identified by content_hash) using its mapping configuration to generate FollowTheMoney entities, which are written to the entity repository.

Example
from ftm_lakehouse.operation import MappingOperation, MappingJob

job = MappingJob.make(
    dataset="my_dataset",
    content_hash="5a6acf229ba576d9a40b09292595658bbb74ef56",
)
op = MappingOperation(job=job)
result = op.run()
print(f"Generated {result.done} entities")
Source code in ftm_lakehouse/operation/mapping.py
class MappingOperation(DatasetJobOperation[MappingJob]):
    """
    Mapping workflow that transforms a CSV file into entities.

    Processes a single archived CSV file (identified by content_hash)
    using its mapping configuration to generate FollowTheMoney entities,
    which are written to the entity repository.

    Example:
        ```python
        from ftm_lakehouse.operation import MappingOperation, MappingJob

        job = MappingJob.make(
            dataset="my_dataset",
            content_hash="5a6acf229ba576d9a40b09292595658bbb74ef56",
        )
        op = MappingOperation(job=job)
        result = op.run()
        print(f"Generated {result.done} entities")
        ```
    """

    def __init__(self, *args, **kwargs) -> None:
        """Set up the mapping repository alongside the inherited ones."""
        super().__init__(*args, **kwargs)
        self.mappings = MappingRepository(self.job.dataset, self.archive.uri)

    def get_target(self) -> str:
        """Per-file target tag derived from the mapped file's checksum."""
        return tag.mapping_tag(self.job.content_hash)

    def get_dependencies(self) -> list[str]:
        """Freshness depends on the mapping config for this checksum."""
        return [path.mapping(self.job.content_hash)]

    def handle(self, run: JobRun[MappingJob], *args, **kwargs) -> None:
        """
        Process the mapping configuration and store generated entities.

        Note: freshness-based skipping happens in `run()` (via `get_target()`
        and `get_dependencies()`), before this method is called.
        """
        origin = mapping_origin(self.job.content_hash)
        mapping = self.mappings.get(self.job.content_hash)
        file = self.archive.get_file(self.job.content_hash)
        # stream the archived csv to a local path for mapping
        with self.archive.local_path(file.checksum) as csv_path:
            with self.entities.writer(origin=origin) as bulk:
                for entity in map_entities(mapping, csv_path):
                    bulk.add_entity(entity)
                    run.job.done += 1
        self.entities.flush()

handle(run, *args, **kwargs)

Process the mapping configuration and store generated entities.

Skips processing if the mapping output is already up-to-date relative to the mapping config.

Source code in ftm_lakehouse/operation/mapping.py
def handle(self, run: JobRun[MappingJob], *args, **kwargs) -> None:
    """
    Process the mapping configuration and store generated entities.

    Note: freshness-based skipping happens in `run()` (via `get_target()`
    and `get_dependencies()`), before this method is called.
    """
    origin = mapping_origin(self.job.content_hash)
    mapping = self.mappings.get(self.job.content_hash)
    file = self.archive.get_file(self.job.content_hash)
    # stream the archived csv to a local path for mapping
    with self.archive.local_path(file.checksum) as csv_path:
        with self.entities.writer(origin=origin) as bulk:
            for entity in map_entities(mapping, csv_path):
                bulk.add_entity(entity)
                run.job.done += 1
    self.entities.flush()

OptimizeOperation

Compact Delta Lake parquet files and optionally apply translog to main table.

ftm_lakehouse.operation.optimize.OptimizeJob

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/optimize.py
class OptimizeJob(DatasetJobModel):
    """Job model for parquet store optimization; attributes mirror the
    options passed through to the statements store optimizer."""

    bucket: str | None = None  # optional scope: only optimize this bucket
    origin: str | None = None  # optional scope: only optimize this origin
    vacuum: bool = False  # purge old files after optimizing
    vacuum_keep_hours: int = 0  # retention window for vacuum
    compact: bool = False  # run full compaction first (heavier operation)

ftm_lakehouse.operation.OptimizeOperation

Bases: DatasetJobOperation[OptimizeJob]

Optimize the parquet delta lake with optional vacuum (purge of old files). The optimization can be scoped to a bucket and/or an origin. For instance, after a crawl operation, only optimizing origin=crawl is feasible.

When compact=True, performs full compaction: dedup, remove tombstones, rewrite the table, Z_ORDER optimize, and vacuum. This is a heavier operation than standard optimize but produces a clean table.

Depending on the size of the dataset, this can be a very long running operation that may require some local memory and tmp disk storage.

Source code in ftm_lakehouse/operation/optimize.py
class OptimizeOperation(DatasetJobOperation[OptimizeJob]):
    """
    Optimize the parquet delta lake with optional vacuum (purge of old
    files). The optimization can be scoped to a bucket and/or an origin. For
    instance, after a crawl operation, only optimizing origin=crawl is
    feasible.

    When compact=True, performs full compaction: dedup, remove tombstones,
    rewrite the table, Z_ORDER optimize, and vacuum. This is a heavier
    operation than standard optimize but produces a clean table.

    Depending on the size of the dataset, this can be a very long running
    operation that may require some local memory and tmp disk storage.
    """

    target = tag.STORE_OPTIMIZED
    dependencies = [tag.STATEMENTS_UPDATED]

    def handle(self, run: JobRun[OptimizeJob], *args, **kwargs) -> None:
        """Run optional compaction, then optimize the statements store with
        the job's scoping and vacuum options."""
        if run.job.compact:
            self.entities._statements.compact()
        self.entities._statements.optimize(
            vacuum=run.job.vacuum,
            vacuum_keep_hours=run.job.vacuum_keep_hours,
            bucket=run.job.bucket,
            origin=run.job.origin,
        )
        run.job.done = 1

MakeOperation

Full workflow: flush journal + all exports.

ftm_lakehouse.operation.make.MakeJob

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/make.py
class MakeJob(DatasetJobModel):
    """Job model for the full make workflow; no extra configuration."""

    pass

ftm_lakehouse.operation.MakeOperation

Bases: DatasetJobOperation[MakeJob]

Source code in ftm_lakehouse/operation/make.py
class MakeOperation(DatasetJobOperation[MakeJob]):
    """Full make workflow: flush the journal, then run all exports."""

    target = tag.OP_MAKE
    dependencies = [tag.JOURNAL_UPDATED, tag.STATEMENTS_UPDATED]

    def handle(self, run: JobRun, *args, **kwargs) -> None:
        """Flush entities and run each export factory in order.

        NOTE(review): relies on `self._dataset`, which is only populated
        when the operation is created via `from_job()` — confirm direct
        construction is unsupported for this operation.
        """
        force = kwargs.get("force", False)
        ds = self._dataset
        ds.entities.flush()
        factories.export_statements(ds, force=force)
        factories.export_entities(ds, force=force)
        factories.export_documents(ds, force=force)
        factories.export_statistics(ds, force=force)
        factories.export_index(ds, force=force)
        run.job.done = 1

RecreateOperation

Repair corrupted datasets from exported files.

ftm_lakehouse.operation.recreate.RecreateJob

Bases: DatasetJobModel

Job model for recreate operation.

Source code in ftm_lakehouse/operation/recreate.py
class RecreateJob(DatasetJobModel):
    """Job model for recreate operation."""

    source: RecreateSource = RecreateSource.AUTO  # export to rebuild from; AUTO picks by tag timestamps
    statements_imported: int = 0  # progress counter
    entities_imported: int = 0  # progress counter
    files_imported: int = 0  # progress counter

ftm_lakehouse.operation.recreate.RecreateOperation

Bases: DatasetJobOperation[RecreateJob]

Recreate a corrupted dataset by rebuilding the parquet store from exports.

This operation repairs corrupted lakehouse datasets by: 1. Clearing the statement store (parquet) and journal 2. Re-importing entities/statements from the most recent export

The source for re-import is selected based on tag timestamps: - If entities.ftm.json is newer, import entities - If statements.csv is newer, import statements - Can be forced to use a specific source via the job's source field

Warning: This operation is destructive - it will delete all existing statement data before re-importing from exports.

Source code in ftm_lakehouse/operation/recreate.py
class RecreateOperation(DatasetJobOperation[RecreateJob]):
    """
    Rebuild a corrupted dataset's parquet store from its exported files.

    The repair proceeds in four steps:
    1. Clear the statement store (parquet) and journal
    2. Re-import entities or statements from the most recent export
    3. Re-derive document entities from the "crawl" archive origin
    4. Flush the journal back into parquet

    Which export feeds step 2 is decided by tag timestamps (the newer of
    entities.ftm.json / statements.csv wins), unless the job's `source`
    field forces one explicitly.

    Warning: this is destructive — every existing statement is deleted
    before anything is re-imported from the exports.
    """

    target = tag.OP_RECREATE
    dependencies = []  # no freshness short-circuit: always runs when invoked

    def _get_source(self) -> RecreateSource:
        """Pick the export to rebuild from: job override, else tag timestamps."""
        if self.job.source != RecreateSource.AUTO:
            return self.job.source

        ts_entities = self.tags.get(tag.ENTITIES_JSON)
        ts_statements = self.tags.get(tag.EXPORTS_STATEMENTS)
        has_entities = self.entities._store.exists(path.ENTITIES_JSON)
        has_statements = self.entities._store.exists(path.EXPORTS_STATEMENTS)

        if not (has_entities or has_statements):
            raise RuntimeError(
                "No export files found. Cannot recreate dataset without "
                f"`{path.ENTITIES_JSON}` or `{path.EXPORTS_STATEMENTS}`"
            )
        if not has_entities:
            return RecreateSource.STATEMENTS
        if not has_statements:
            return RecreateSource.ENTITIES

        # Both exports exist. An untagged entities export loses to
        # statements (this also covers "no tags at all", where statements
        # are preferred as the faster import); an untagged statements
        # export loses to entities.
        if ts_entities is None:
            return RecreateSource.STATEMENTS
        if ts_statements is None:
            return RecreateSource.ENTITIES

        # Otherwise the most recently tagged export wins; ties go to entities.
        if ts_entities >= ts_statements:
            return RecreateSource.ENTITIES
        return RecreateSource.STATEMENTS

    def _import_from_entities(self, run: JobRun[RecreateJob]) -> None:
        """Re-ingest the dataset from the entities.ftm.json export."""
        uri = self.entities._store.to_uri(path.ENTITIES_JSON)
        self.log.info(f"Importing from `{path.ENTITIES_JSON}` ...", uri=mask_uri(uri))

        with self.entities.writer() as writer:
            for entity in self.entities.stream():
                writer.add_entity(entity)
                run.job.entities_imported += 1
                count = run.job.entities_imported
                # Log + checkpoint progress every 10k entities.
                if count % 10_000 == 0:
                    self.log.info(
                        f"Importing Entity {count} ...",
                        entities=count,
                        uri=mask_uri(uri),
                    )
                    run.save()

        self.log.info(
            f"Importing from `{path.ENTITIES_JSON}` done.",
            entities=run.job.entities_imported,
            uri=mask_uri(uri),
        )
        run.save()

    def _import_from_statements(self, run: JobRun[RecreateJob]) -> None:
        """Re-ingest the dataset from the statements.csv export."""
        uri = self.entities._store.to_uri(path.EXPORTS_STATEMENTS)
        self.log.info(
            f"Importing from `{path.EXPORTS_STATEMENTS}` ...", uri=mask_uri(uri)
        )

        with self.entities.writer() as writer, smart_open(uri, "rb") as fh:
            for stmt in read_csv_statements(fh):  # type: ignore[arg-type]
                writer.add_statement(stmt)
                run.job.statements_imported += 1
                count = run.job.statements_imported
                # Log + checkpoint progress every 100k statements.
                if count % 100_000 == 0:
                    self.log.info(
                        f"Importing Statement {count} ...",
                        statements=count,
                        uri=mask_uri(uri),
                    )
                    run.save()

        self.log.info(
            f"Importing from `{path.EXPORTS_STATEMENTS}` done.",
            statements=run.job.statements_imported,
            uri=mask_uri(uri),
        )
        run.save()

    def _import_from_archive(self, run: JobRun[RecreateJob]) -> None:
        """Collect files metadata to add document entities"""
        masked = mask_uri(self.archive.uri)
        self.log.info("Importing from archive ...", uri=masked)

        with self.entities.writer(origin=tag.CRAWL_ORIGIN) as writer:
            for file in self.archive.iterate_files():
                # Only crawl-origin files are turned into entities; every
                # file still counts toward the progress counter.
                if file.origin == tag.CRAWL_ORIGIN:
                    for entity in file.make_entities():
                        writer.add_entity(entity)
                run.job.files_imported += 1
                count = run.job.files_imported
                if count % 1_000 == 0:
                    self.log.info(
                        f"Importing File {count} ...",
                        files=count,
                        uri=masked,
                    )
                    run.save()

        self.log.info(
            "Importing from archive done.",
            files=run.job.files_imported,
            uri=masked,
        )
        run.save()

    def handle(self, run: JobRun[RecreateJob], *args, **kwargs) -> None:
        source = self._get_source()
        self.log.info("Recreating dataset ...", source=source.value)

        # Step 1: wipe the parquet statement store before re-importing.
        self.entities._statements.destroy()

        # Step 2: replay the chosen export into the now-empty store.
        if source == RecreateSource.STATEMENTS:
            self._import_from_statements(run)
        else:
            self._import_from_entities(run)

        # Step 3: re-derive document entities from the "crawl" origin.
        self._import_from_archive(run)

        # Step 4: persist the journal into parquet.
        flushed = self.entities.flush()

        self.log.info(
            "Recreate complete",
            source=source.value,
            entities_imported=run.job.entities_imported,
            statements_imported=run.job.statements_imported,
            statements_flushed=flushed,
        )

        run.job.done = 1

DownloadArchiveOperation

Export archive files to their original paths.

ftm_lakehouse.operation.download.DownloadArchiveJob

Bases: DatasetJobModel

Source code in ftm_lakehouse/operation/download.py
class DownloadArchiveJob(DatasetJobModel):
    # Destination URI the archive is exported to (resolved with get_store()
    # in DownloadArchiveOperation.handle).
    target: Uri
    # Number of documents skipped because they already exist at the target.
    skipped: int = 0

ftm_lakehouse.operation.DownloadArchiveOperation

Bases: DatasetJobOperation[DownloadArchiveJob]

Download the archive files to a target, transforming them into nice paths based on the exported documents.csv

Source code in ftm_lakehouse/operation/download.py
class DownloadArchiveOperation(DatasetJobOperation[DownloadArchiveJob]):
    """
    Download the archive files to a target transforming into nice paths based on
    exported documents.csv

    Documents already present at the target are skipped and counted in
    `job.skipped`; each successfully streamed file increments `job.done`.
    """

    target = tag.OP_DOWNLOAD_ARCHIVE
    dependencies = [path.EXPORTS_DOCUMENTS]

    def handle(self, run: JobRun[DownloadArchiveJob], *args, **kwargs) -> None:
        # Resolve the destination store from the job's target URI.
        target = get_store(run.job.target)
        self.log.info(
            "Downloading archive ...",
            target=mask_uri(target.uri),
            documents=mask_uri(self.documents.csv_uri),
        )
        for document in self.documents.stream():
            if target.exists(document.relative_path):
                self.log.debug(
                    f"Skipping `{document.relative_path}`, already exists.",
                    checksum=document.checksum,
                    source=mask_uri(self.archive.uri),
                    target=mask_uri(target.uri),
                )
                run.job.skipped += 1
                continue

            self.log.info(
                f"Downloading `{document.relative_path}` ...",
                checksum=document.checksum,
                source=mask_uri(self.archive.uri),
                target=mask_uri(target.uri),
            )
            # Open the archive source *before* creating the target file.
            # The previous order opened the target first, so a failing
            # archive.open() left behind an empty target file that the
            # exists() check above would wrongly skip on every later run.
            with self.archive.open(document.checksum) as i:
                with target.open(document.relative_path, "wb") as o:
                    stream(i, o, CHUNK_SIZE_LARGE)
            run.job.done += 1