Skip to content

Ingestors

SevenZipIngestor

ingestors.packages.SevenZipIngestor

File types

  • application/x-7z-compressed

  • application/7z-compressed

File extensions

  • .7z

  • .7zip

Bases: PackageSupport, Ingestor, ShellSupport

Source code in ingestors/packages/__init__.py
class SevenZipIngestor(PackageSupport, Ingestor, ShellSupport):
    """Unpack 7-Zip archives into a temporary directory using ``py7zr``."""

    MIME_TYPES = ["application/x-7z-compressed", "application/7z-compressed"]
    EXTENSIONS = ["7z", "7zip"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract all members of the archive at ``file_path`` into ``temp_dir``.

        Args:
            file_path: location of the archive on disk.
            entity: the entity being ingested (not used by this method).
            temp_dir: directory that receives the extracted files.

        Raises:
            ProcessingException: if ``py7zr`` raises an ``ArchiveError``.
        """
        # Check whether file_path belongs to a fragmented 7z archive whose
        # name carries "_7z" where ".7z" is expected, and rebuild the name.
        archive_path = PurePath(file_path)
        if "_7z" in archive_path.name:
            # ``with_name`` swaps only the last component, so the directory
            # part is kept as given. The earlier join onto "/" silently
            # dropped the first component of a relative path.
            archive_path = archive_path.with_name(
                archive_path.name.replace("_7z", ".7z")
            )

        try:
            with py7zr.SevenZipFile(str(archive_path), mode="r") as extractor:
                extractor.extractall(path=temp_dir)
        except ArchiveError as e:
            # Chain the cause so the py7zr traceback is not lost.
            raise ProcessingException(f"Error: {e}") from e

AccessIngestor

ingestors.tabular.access.AccessIngestor

File types

  • application/msaccess

  • application/x-msaccess

  • application/vnd.msaccess

  • application/vnd.ms-access

  • application/mdb

  • application/x-mdb

File extensions

  • .mdb

Bases: Ingestor, TableSupport, ShellSupport

Source code in ingestors/tabular/access.py
class AccessIngestor(Ingestor, TableSupport, ShellSupport):
    """Ingest Microsoft Access databases via the ``mdb-tools`` binaries.

    Each table in the database is emitted as a ``Table`` entity whose parent
    is the ingested file, which itself becomes a ``Workbook``.
    """

    MIME_TYPES = [
        "application/msaccess",
        "application/x-msaccess",
        "application/vnd.msaccess",
        "application/vnd.ms-access",
        "application/mdb",
        "application/x-mdb",
    ]
    EXTENSIONS = ["mdb"]
    SCORE = 8

    def get_tables(self, local_path):
        """Return the table names reported by ``mdb-tables`` for ``local_path``.

        Raises:
            RuntimeError: if the ``mdb-tables`` command cannot be found.
            ProcessingException: if ``mdb-tables`` exits with an error.
        """
        mdb_tables = self.find_command("mdb-tables")
        if mdb_tables is None:
            raise RuntimeError("mdb-tools is not available")
        try:
            output = subprocess.check_output([mdb_tables, local_path])
            # NOTE(review): the output is split on single spaces, so a table
            # name that itself contains a space would be broken apart.
            return [
                t.strip().decode("utf-8") for t in output.split(b" ") if len(t.strip())
            ]
        except subprocess.CalledProcessError as cpe:
            log.warning("Failed to open MDB: %s", cpe)
            raise ProcessingException("Failed to extract Access DB.") from cpe

    def generate_rows(self, file_path, table_name):
        """Yield one ``OrderedDict`` per row of ``table_name``.

        The table is exported as CSV by ``mdb-export``; the first CSV row
        supplies the column names for all following rows.

        Raises:
            RuntimeError: if the ``mdb-export`` command cannot be found.
        """
        mdb_export = self.find_command("mdb-export")
        if mdb_export is None:
            raise RuntimeError("mdb-tools is not available")
        args = [mdb_export, "-b", "strip", file_path, table_name]
        # The context manager closes the pipe and waits for the child, also
        # when the consumer abandons this generator early. Previously the
        # process was never reaped and its stdout was never closed.
        with subprocess.Popen(args, stdout=subprocess.PIPE) as proc:
            output = io.TextIOWrapper(proc.stdout, newline=os.linesep)
            reader = csv.reader(output, delimiter=",")
            headers = next(reader, None)
            if headers is not None:
                for row in reader:
                    yield OrderedDict(zip(headers, row))
        if proc.returncode:
            # Rows seen so far were already yielded; make the failure visible
            # instead of silently truncating the table.
            log.warning(
                "mdb-export exited with status %s for table %r",
                proc.returncode,
                table_name,
            )

    def ingest(self, file_path, entity):
        """Turn ``entity`` into a Workbook and emit one Table per database table."""
        entity.schema = model.get("Workbook")
        for table_name in self.get_tables(file_path):
            table = self.manager.make_entity("Table", parent=entity)
            table.make_id(entity.id, table_name)
            table.set("title", table_name)
            # Emit a partial table fragment with parent reference and name
            # early, so that we don't have orphan fragments in case of an error
            # in the middle of processing.
            # See https://github.com/alephdata/ingest-file/issues/171
            self.manager.emit_entity(table, fragment="initial")
            rows = self.generate_rows(file_path, table_name)
            self.emit_row_dicts(table, rows)
            self.manager.emit_entity(table)

AudioIngestor

ingestors.media.audio.AudioIngestor

File types

  • audio/mpeg

  • audio/mp3

  • audio/x-m4a

  • audio/x-hx-aac-adts

  • audio/x-wav

  • audio/mp4

  • audio/ogg

  • audio/vnd.wav

  • audio/flac

  • audio/x-ms-wma

  • audio/webm

File extensions

  • .wav

  • .mp3

  • .aac

  • .ac3

  • .m4a

  • .m4b

  • .ogg

  • .opus

  • .flac

  • .wma

Bases: Ingestor, TimestampSupport, TranscriptionSupport

Source code in ingestors/media/audio.py
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
    """Read audio metadata with MediaInfo and queue the file for transcription."""

    MIME_TYPES = [
        "audio/mpeg",
        "audio/mp3",
        "audio/x-m4a",
        "audio/x-hx-aac-adts",
        "audio/x-wav",
        "audio/mp4",
        "audio/ogg",
        "audio/vnd.wav",
        "audio/flac",
        "audio/x-ms-wma",
        "audio/webm",
    ]
    EXTENSIONS = [
        "wav",
        "mp3",
        "aac",
        "ac3",
        "m4a",
        "m4b",
        "ogg",
        "opus",
        "flac",
        "wma",
    ]
    SCORE = 3

    def ingest(self, file_path, entity):
        """Copy per-track metadata onto ``entity``, then request transcription.

        Raises:
            ProcessingException: if reading the metadata fails for any reason.
            A failure to queue the transcription is only logged.
        """
        try:
            entity.schema = model.get("Audio")
            for track in MediaInfo.parse(file_path).tracks:
                entity.add("title", track.title)
                for attr in ("writing_application", "writing_library", "publisher"):
                    entity.add("generator", getattr(track, attr))
                for attr in ("recorded_date", "tagged_date", "encoded_date"):
                    stamp = self.parse_timestamp(getattr(track, attr))
                    entity.add("authoredAt", stamp)
                entity.add(
                    "modifiedAt",
                    self.parse_timestamp(track.file_last_modification_date),
                )
                rate = track.sampling_rate
                if rate:
                    entity.add("samplingRate", rate)
                entity.add("duration", track.duration)
        except Exception as ex:
            raise ProcessingException(f"Could not read audio: {ex}") from ex

        try:
            self.transcribe(self.manager.dataset, entity, self.manager.context)
        except Exception as ex:
            log.error(f"Could not queue audio for transcription: {ex}")

    @classmethod
    def match(cls, file_path, entity):
        """Score this ingestor for ``entity``.

        When the inherited match yields no positive score, any ``audio/*``
        MIME type on the entity is accepted with twice the class score.
        """
        inherited = super(AudioIngestor, cls).match(file_path, entity)
        if inherited <= 0 and any(
            mime.startswith("audio/") for mime in entity.get("mimeType")
        ):
            return cls.SCORE * 2
        return inherited

BZ2Ingestor

ingestors.packages.BZ2Ingestor

File types

  • application/x-bzip

  • application/x-bzip2

  • multipart/x-bzip

  • multipart/x-bzip2

File extensions

  • .bz

  • .tbz

  • .bz2

  • .tbz2

Bases: SingleFilePackageIngestor

Source code in ingestors/packages/__init__.py
class BZ2Ingestor(SingleFilePackageIngestor):
    """Decompress a single bzip2-compressed file."""

    MIME_TYPES = [
        "application/x-bzip",
        "application/x-bzip2",
        "multipart/x-bzip",
        "multipart/x-bzip2",
    ]
    EXTENSIONS = ["bz", "tbz", "bz2", "tbz2"]

    def unpack_file(self, file_path, temp_file):
        """Write the decompressed content of ``file_path`` to ``temp_file``.

        Raises:
            ProcessingException: if the input cannot be read or decompressed,
                or the output cannot be written.
        """
        try:
            with bz2.BZ2File(file_path) as src, open(temp_file, "wb") as dst:
                shutil.copyfileobj(src, dst)
        except (OSError, EOFError) as err:
            # A truncated stream surfaces as EOFError, which is not an
            # OSError and previously escaped as an unhandled exception.
            raise ProcessingException("Error: %s" % err) from err

CalendarIngestor

ingestors.email.calendar.CalendarIngestor

File types

  • text/calendar

File extensions

  • .ics

  • .ical

  • .icalendar

  • .ifb

Bases: Ingestor, EncodingSupport

Source code in ingestors/email/calendar.py
class CalendarIngestor(Ingestor, EncodingSupport):
    """Ingest iCalendar files, emitting one ``Event`` entity per VEVENT."""

    MIME_TYPES = ["text/calendar"]
    EXTENSIONS = ["ics", "ical", "icalendar", "ifb"]
    SCORE = 10

    def address_entity(self, address):
        """Return the identity entity for a calendar address, or ``None``.

        Args:
            address: an ORGANIZER/ATTENDEE value, optionally prefixed with
                ``mailto:``; ``None`` when the property is absent.
        """
        if address is None:
            # Without this guard the literal text "None" was handed to
            # EmailIdentity as if it were an address.
            # NOTE(review): assumes EmailIdentity yielded no entity for that
            # text anyway, so callers see the same result - confirm.
            return None
        email = str(address).strip()
        if email.lower().startswith("mailto:"):
            # Slice the stripped string; slicing the raw ``address`` cut at
            # the wrong offset whenever it carried leading whitespace.
            email = email[len("mailto:") :]
        identity = EmailIdentity(self.manager, None, email)
        return identity.entity

    def ingest_component(self, entity, idx, comp):
        """Process one calendar component found while walking the file.

        A VCALENDAR contributes its PRODID as the generator of ``entity``;
        a VEVENT is emitted as its own ``Event`` entity using ``idx`` as the
        fragment. Other component types are ignored.
        """
        if comp.name == "VCALENDAR":
            entity.add("generator", comp.get("PRODID"))
        if comp.name == "VEVENT":
            event = self.manager.make_entity("Event")
            self.manager.apply_context(event, entity)
            uid = sanitize_text(comp.get("UID"))
            if uid is not None:
                event.make_id(uid)
            else:
                # No usable UID: derive the ID from the file and position.
                event.make_id(entity.id, idx)
            event.add("proof", entity)
            event.add("name", comp.get("SUMMARY"))
            event.add("description", comp.get("DESCRIPTION"))
            event.add("location", comp.get("LOCATION"))
            event.add("sourceUrl", comp.get("URL"))
            event.add("startDate", cal_date(comp.get("DTSTART")))
            event.add("endDate", cal_date(comp.get("DTEND")))
            event.add("date", cal_date(comp.get("CREATED")))
            event.add("modifiedAt", cal_date(comp.get("LAST-MODIFIED")))
            event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
            for attendee in ensure_list(comp.get("ATTENDEE")):
                event.add("involved", self.address_entity(attendee))
            self.manager.emit_entity(event, fragment=idx)

    def ingest(self, file_path, entity):
        """Store the decoded file text on ``entity`` and ingest its components.

        Raises:
            ProcessingException: if parsing or processing the calendar fails.
        """
        entity.schema = model.get("PlainText")
        entity.add("encoding", "utf-8")
        text = self.read_file_decoded(entity, file_path)
        entity.set("bodyText", text)
        try:
            calendar = icalendar.Calendar.from_ical(text)
            for idx, comp in enumerate(calendar.walk()):
                self.ingest_component(entity, idx, comp)
        except Exception as exc:
            raise ProcessingException("Failed to parse iCalendar") from exc

CSVIngestor

ingestors.tabular.csv.CSVIngestor

Decode and ingest a CSV file.

This expects a properly formatted CSV file with a header in the first row.

File types

  • text/csv

  • text/tsv

  • text/tab-separated-values

File extensions

  • .csv

  • .tsv

Bases: Ingestor, TableSupport

Decode and ingest a CSV file.

This expects a properly formatted CSV file with a header in the first row.

Source code in ingestors/tabular/csv.py
class CSVIngestor(Ingestor, TableSupport):
    """Decode and ingest a CSV file.

    This expects a properly formatted CSV file with a header in the first row.
    """

    MIME_TYPES = ["text/csv", "text/tsv", "text/tab-separated-values"]
    EXTENSIONS = ["csv", "tsv"]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Detect encoding and dialect of the file, then emit its rows.

        Raises:
            ProcessingException: if sniffing the dialect or reading rows fails.
        """
        entity.schema = model.get("Table")
        with io.open(file_path, "rb") as fh:
            encoding = self.detect_stream_encoding(fh)
            log.debug("Detected encoding [%r]: %s", entity, encoding)

        # newline="" is what the csv module requires of its input: with
        # newline translation enabled, line breaks inside quoted fields are
        # rewritten before the reader sees them.
        with io.open(
            file_path, "r", encoding=encoding, errors="replace", newline=""
        ) as fh:
            try:
                sample = fh.read(4096 * 10)
                fh.seek(0)
                # Sniff on newline-normalised text, i.e. the same text the
                # sniffer received before, so that untranslated "\r"
                # characters cannot be picked up as a delimiter candidate.
                normalised = sample.replace("\r\n", "\n").replace("\r", "\n")
                dialect = csv.Sniffer().sniff(normalised)
                reader = csv.reader(fh, dialect=dialect)
                self.emit_row_tuples(entity, reader)
            except Exception as err:
                log.warning("CSV error: %s", err)
                raise ProcessingException("Invalid CSV: %s" % err) from err

DBFIngestor

ingestors.tabular.dbf.DBFIngestor

File types

  • application/dbase

  • application/x-dbase

  • application/dbf

  • application/x-dbf

File extensions

  • .dbf

Bases: Ingestor, TableSupport

Source code in ingestors/tabular/dbf.py
class DBFIngestor(Ingestor, TableSupport):
    """Ingest dBase (DBF) table files."""

    MIME_TYPES = [
        "application/dbase",
        "application/x-dbase",
        "application/dbf",
        "application/x-dbf",
    ]
    EXTENSIONS = ["dbf"]
    # NOTE(review): every other ingestor on this page declares ``SCORE``;
    # this attribute is named ``BASE_SCORE`` and may therefore never be read
    # by the matching logic. Left unchanged - confirm against ``Ingestor``.
    BASE_SCORE = 8

    def generate_rows(self, table):
        """Yield one ``OrderedDict`` per record, keyed by the field names.

        A record that raises while being converted is logged and skipped.
        """
        headers = [stringify(h) for h in table.field_names]
        for row in table:
            try:
                yield OrderedDict(zip(headers, row))
            except Exception as ex:
                log.warning("Cannot decode DBF row: %s", ex)

    def ingest(self, file_path, entity):
        """Open the DBF file and emit its rows as a ``Table``.

        Raises:
            ProcessingException: if the DBF library raises a ``DbfError``.
        """
        entity.schema = model.get("Table")
        table = None
        try:
            table = Table(file_path.as_posix()).open()
            # Assumes emit_row_dicts consumes all rows before returning, so
            # closing the table afterwards is safe - TODO confirm.
            self.emit_row_dicts(entity, self.generate_rows(table))
        except DbfError as err:
            raise ProcessingException("Cannot open DBF file: %s" % err) from err
        finally:
            # The opened table was previously never closed.
            if table is not None:
                table.close()

DjVuIngestor

ingestors.documents.djvu.DjVuIngestor

Read DjVu e-books.

File types

  • image/vnd.djvu

  • image/x.djvu

  • image/x-djvu

  • image/djvu

File extensions

Bases: Ingestor, PDFSupport, TempFileSupport

Read DjVu e-books.

Source code in ingestors/documents/djvu.py
class DjVuIngestor(Ingestor, PDFSupport, TempFileSupport):
    """Read DjVu e-books by converting them to PDF."""

    MIME_TYPES = ["image/vnd.djvu", "image/x.djvu", "image/x-djvu", "image/djvu"]

    def ingest(self, file_path, entity):
        """Convert the file to a PDF work file with ``ddjvu`` and extract it."""
        entity.schema = model.get("Pages")
        converted = self.make_work_file("page.pdf")
        ddjvu_args = ("-format=pdf", "-quality=100", "-skip", file_path, converted)
        self.exec_command("ddjvu", *ddjvu_args)
        # Verify the converter produced its output before extracting from it.
        self.assert_outfile(converted)
        self.pdf_alternative_extract(entity, converted, self.manager)

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/djvu.py
def ingest(self, file_path, entity):
    """Convert the file to a PDF work file with ``ddjvu`` and extract it."""
    entity.schema = model.get("Pages")
    converted = self.make_work_file("page.pdf")
    ddjvu_args = ("-format=pdf", "-quality=100", "-skip", file_path, converted)
    self.exec_command("ddjvu", *ddjvu_args)
    # Verify the converter produced its output before extracting from it.
    self.assert_outfile(converted)
    self.pdf_alternative_extract(entity, converted, self.manager)

AppleEmlxIngestor

ingestors.email.emlx.AppleEmlxIngestor

File types

File extensions

  • .emlx

Bases: RFC822Ingestor

Source code in ingestors/email/emlx.py
class AppleEmlxIngestor(RFC822Ingestor):
    """Ingest ``.emlx`` files: a byte-count line followed by the message."""

    MIME_TYPES = []
    EXTENSIONS = ["emlx"]
    SCORE = 8

    def ingest(self, file_path, entity):
        """Parse the embedded message and pass it to ``ingest_msg``.

        Raises:
            ProcessingException: if the length line or the message is invalid.
        """
        entity.schema = model.get("Email")
        try:
            with open(file_path, "rb") as fh:
                # The first line holds the length in bytes of the message
                # that follows it.
                length_line = fh.readline()
                raw_message = fh.read(int(length_line.strip()))
            msg = email.message_from_bytes(raw_message, policy=default)
        except (MessageError, ValueError, IndexError) as err:
            raise ProcessingException("Cannot parse email: %s" % err) from err

        self.ingest_msg(entity, msg)

GzipIngestor

ingestors.packages.GzipIngestor

File types

  • application/gzip

  • application/x-gzip

  • multipart/x-gzip

File extensions

  • .gz

  • .tgz

Bases: SingleFilePackageIngestor

Source code in ingestors/packages/__init__.py
class GzipIngestor(SingleFilePackageIngestor):
    """Decompress a single gzip-compressed file."""

    MIME_TYPES = ["application/gzip", "application/x-gzip", "multipart/x-gzip"]
    EXTENSIONS = ["gz", "tgz"]

    def unpack_file(self, file_path, temp_file):
        """Write the decompressed content of ``file_path`` to ``temp_file``.

        Raises:
            ProcessingException: if the input cannot be read or decompressed,
                or the output cannot be written.
        """
        # Imported locally because only the error type is needed here.
        import zlib

        try:
            with gzip.GzipFile(file_path) as src, open(temp_file, "wb") as dst:
                shutil.copyfileobj(src, dst)
        except (OSError, EOFError, zlib.error) as err:
            # Truncated input raises EOFError and a corrupt deflate stream
            # raises zlib.error; neither is an OSError, so both previously
            # escaped as unhandled exceptions.
            raise ProcessingException("Error: %s" % err) from err

HTMLIngestor

ingestors.documents.html.HTMLIngestor

HTML file ingestor class. Extracts the text from the web page.

File types

  • text/html

File extensions

  • .htm

  • .html

  • .xhtml

Bases: Ingestor, EncodingSupport, HTMLSupport

HTML file ingestor class. Extracts the text from the web page.

Source code in ingestors/documents/html.py
class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
    """HTML file ingestor class. Extracts the text from the web page."""

    MIME_TYPES = ["text/html"]
    EXTENSIONS = ["htm", "html", "xhtml"]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Decode the file and hand its markup to the HTML extraction helper."""
        entity.schema = model.get("HyperText")
        markup = self.read_file_decoded(entity, file_path)
        self.extract_html_content(entity, markup)

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/html.py
def ingest(self, file_path, entity):
    """Decode the file and hand its markup to the HTML extraction helper."""
    entity.schema = model.get("HyperText")
    markup = self.read_file_decoded(entity, file_path)
    self.extract_html_content(entity, markup)

IgnoreIngestor

ingestors.ignore.IgnoreIngestor

File types

  • application/x-pkcs7-mime

  • application/pkcs7-mime

  • application/pkcs7-signature

  • application/x-pkcs7-signature

  • application/x-pkcs12

  • application/pgp-encrypted

  • application/x-shockwave-flash

  • application/vnd.apple.pkpass

  • application/x-executable

  • application/x-mach-binary

  • application/x-sharedlib

  • application/x-dosexec

  • application/x-java-keystore

  • application/java-archive

  • application/font-sfnt

  • application/vnd.ms-office.vbaproject

  • application/x-x509-ca-cert

  • text/calendar

  • text/css

  • application/vnd.ms-opentype

  • application/x-font-ttf

File extensions

  • .json

  • .exe

  • .dll

  • .ini

  • .class

  • .jar

  • .psd

  • .indd

  • .sql

  • .dat

  • .log

  • .pbl

  • .p7m

  • .plist

  • .ics

  • .axd

Bases: Ingestor

Source code in ingestors/ignore.py
class IgnoreIngestor(Ingestor):
    """Accept files that should be stored but not processed any further."""

    MIME_TYPES = [
        "application/x-pkcs7-mime",
        "application/pkcs7-mime",
        "application/pkcs7-signature",
        "application/x-pkcs7-signature",
        # These two entries were fused into one bogus type by a missing
        # comma (implicit string concatenation), so neither ever matched.
        "application/x-pkcs12",
        "application/pgp-encrypted",
        "application/x-shockwave-flash",
        "application/vnd.apple.pkpass",
        "application/x-executable",
        "application/x-mach-binary",
        "application/x-sharedlib",
        "application/x-dosexec",
        "application/x-java-keystore",
        "application/java-archive",
        "application/font-sfnt",
        "application/vnd.ms-office.vbaproject",
        "application/x-x509-ca-cert",
        "text/calendar",
        "text/css",
        "application/vnd.ms-opentype",
        "application/x-font-ttf",
    ]
    EXTENSIONS = [
        "json",
        "exe",
        "dll",
        "ini",
        "class",
        "jar",
        "psd",  # adobe photoshop
        "indd",  # adobe indesign
        "sql",
        "dat",
        "log",
        "pbl",
        "p7m",
        "plist",
        "ics",
        "axd",
    ]
    # Exact file names that are ignored regardless of type or extension.
    NAMES = [".DS_Store", "Thumbs.db", ".gitignore"]
    SCORE = 2

    def ingest(self, file_path, entity):
        """Do nothing except log that the file is ignored but stored."""
        log.info("[%r] will be ignored but stored.", entity)

    @classmethod
    def match(cls, file_path, entity):
        """Score this ingestor for ``entity``.

        An empty file scores ``SCORE * 100``, a file name listed in ``NAMES``
        scores ``SCORE``, and anything else falls back to the inherited match.
        """
        for file_size in entity.get("fileSize"):
            if int(file_size) == 0:
                return cls.SCORE * 100
        for file_name in entity.get("fileName"):
            if file_name in cls.NAMES:
                return cls.SCORE
        return super(IgnoreIngestor, cls).match(file_path, entity)

ImageIngestor

ingestors.media.image.ImageIngestor

Image file ingestor class. Extracts the text from images using OCR.

File types

  • image/x-portable-graymap

  • image/png

  • image/x-png

  • image/jpeg

  • image/jpg

  • image/gif

  • image/pjpeg

  • image/bmp

  • image/x-windows-bmp

  • image/x-portable-bitmap

  • image/x-coreldraw

  • application/postscript

  • image/vnd.dxf

File extensions

  • .jpg

  • .jpe

  • .jpeg

  • .png

  • .gif

  • .bmp

Bases: Ingestor, OCRSupport, TimestampSupport

Image file ingestor class. Extracts the text from images using OCR.

Source code in ingestors/media/image.py
class ImageIngestor(Ingestor, OCRSupport, TimestampSupport):
    """Image file ingestor class. Extracts the text from images using OCR."""

    MIME_TYPES = [
        "image/x-portable-graymap",
        "image/png",
        "image/x-png",
        "image/jpeg",
        "image/jpg",
        "image/gif",
        "image/pjpeg",
        "image/bmp",
        "image/x-windows-bmp",
        "image/x-portable-bitmap",
        "image/x-coreldraw",
        "application/postscript",
        "image/vnd.dxf",
    ]
    EXTENSIONS = ["jpg", "jpe", "jpeg", "png", "gif", "bmp"]
    SCORE = 10

    def extract_exif(self, img, entity):
        """Copy selected EXIF tags from ``img`` onto ``entity``.

        Does nothing when the image object has no ``_getexif`` method or when
        that method returns ``None``. Unknown tag codes are logged and skipped.
        """
        if not hasattr(img, "_getexif"):
            return

        exif = img._getexif()
        if exif is None:
            return

        for num, value in exif.items():
            tag = ExifTags.TAGS.get(num)
            if tag is None:
                log.warning("Unknown EXIF code: %s", num)
                continue
            if tag == "DateTimeOriginal":
                entity.add("authoredAt", self.parse_timestamp(value))
            elif tag == "DateTime":
                entity.add("date", self.parse_timestamp(value))
            elif tag in ("Make", "Model"):
                entity.add("generator", value)

    def ingest(self, file_path, entity):
        """Load the image, record its EXIF metadata and add the OCR text.

        Raises:
            ProcessingException: if loading, EXIF extraction or OCR fails.
        """
        entity.schema = model.get("Image")
        with open(file_path, "rb") as fh:
            data = fh.read()

        try:
            image = Image.open(BytesIO(data))
            image.load()
            self.extract_exif(image, entity)
            languages = self.manager.context.get("languages")
            text = self.extract_ocr_text(data, languages=languages)
            entity.add("bodyText", text)
        except Exception as err:
            # ``Exception`` already covers OSError/IOError; chain the cause
            # as the other ingestors do so the original traceback is kept.
            raise ProcessingException("Failed to open image: %s" % err) from err

    @classmethod
    def match(cls, file_path, entity):
        """Score this ingestor for ``entity``.

        When the inherited match yields no positive score, any ``image/*``
        MIME type on the entity is accepted with ``SCORE - 1``.
        """
        score = super(ImageIngestor, cls).match(file_path, entity)
        if score <= 0:
            for mime_type in entity.get("mimeType"):
                if mime_type.startswith("image/"):
                    score = cls.SCORE - 1
        return score

JSONIngestor

ingestors.misc.jsonfile.JSONIngestor

File types

  • application/json

  • text/javascript

File extensions

  • .json

Bases: Ingestor, EncodingSupport

Source code in ingestors/misc/jsonfile.py
class JSONIngestor(Ingestor, EncodingSupport):
    """Emit every string value found in a JSON document as a text fragment."""

    MIME_TYPES = ["application/json", "text/javascript"]
    EXTENSIONS = ["json"]
    MAX_SIZE = 100 * MEGABYTE
    SCORE = 3

    def _collect_text(self, obj):
        """Recursively yield all strings held in nested containers.

        Dictionary keys are not yielded, only values; anything that is not a
        string, list, set, tuple or dict is skipped.
        """
        if isinstance(obj, str):
            yield obj
        elif isinstance(obj, dict):
            for child in obj.values():
                yield from self._collect_text(child)
        elif isinstance(obj, (list, set, tuple)):
            for child in obj:
                yield from self._collect_text(child)

    def ingest(self, file_path, entity):
        """Parse the file and emit one text fragment per collected string.

        Raises:
            ProcessingException: if a recorded file size exceeds ``MAX_SIZE``,
                or if parsing or emitting fails.
        """
        if any(int(size) > self.MAX_SIZE for size in entity.get("fileSize")):
            raise ProcessingException("JSON file is too large.")

        with open(file_path, "rb") as raw:
            encoding = self.detect_stream_encoding(raw)

        with open(file_path, "r", encoding=encoding) as fh:
            try:
                strings = self._collect_text(json.load(fh))
                for idx, text in enumerate(strings):
                    self.manager.emit_text_fragment(entity, [text], idx)
            except Exception as exc:
                raise ProcessingException("Cannot parse JSON file: %s" % exc) from exc

MboxFileIngestor

ingestors.email.mbox.MboxFileIngestor

File types

  • application/mbox

File extensions

  • .mbox

Bases: RFC822Ingestor, TempFileSupport

Source code in ingestors/email/mbox.py
class MboxFileIngestor(RFC822Ingestor, TempFileSupport):
    """Split an mbox mailbox into individual messages queued as ``Email``."""

    DEFAULT_MIME = "application/mbox"
    MIME_TYPES = [DEFAULT_MIME]
    EXTENSIONS = ["mbox"]
    MAGIC = "From "
    SCORE = 6

    def ingest(self, file_path, entity):
        """Store each message as a separate file and queue it as a child entity.

        A message that cannot be extracted is logged and skipped; the
        remaining messages are still processed.
        """
        mbox = mailbox.mbox(file_path)
        try:
            entity.schema = model.get("Package")
            entity.add("mimeType", self.DEFAULT_MIME)

            for i, msg in enumerate(mbox.itervalues(), 1):
                # Is there a risk of https://bugs.python.org/issue27321 ?
                try:
                    msg_path = self.make_work_file("%s.eml" % i)
                    with open(msg_path, "wb") as fh:
                        gen = BytesGenerator(fh, policy=default)
                        gen.flatten(msg)
                    checksum = self.manager.store(msg_path, mime_type=RFC822)
                    msg_path.unlink()
                    child = self.manager.make_entity("Email", parent=entity)
                    child.make_id(checksum)
                    child.add("contentHash", checksum)
                    child.add("mimeType", RFC822)
                    self.manager.queue_entity(child)
                except Exception:
                    log.exception("[%r] Cannot extract message %s", entity, i)
        finally:
            # The mailbox was previously left open after ingestion.
            mbox.close()

    @classmethod
    def match(cls, file_path, entity):
        """Score this ingestor for ``entity``.

        When the inherited match returns a negative score, the file is
        accepted with ``SCORE`` if it starts with ``MAGIC`` and contains at
        least one message.
        """
        score = super(MboxFileIngestor, cls).match(file_path, entity)
        if score < 0:
            # this was added because a lot of mbox files are just called
            # 'inbox' or 'new', without a file suffix.
            magic = cls.MAGIC
            if isinstance(magic, str):
                # The file is read in binary mode; comparing the bytes with
                # the str constant was always False, so the sniffing below
                # could never trigger.
                magic = magic.encode("ascii")
            with open(file_path, "rb") as fh:
                has_magic = fh.read(len(magic)) == magic
            if has_magic:
                mbox = mailbox.mbox(file_path)
                try:
                    for _ in mbox:
                        return cls.SCORE
                finally:
                    mbox.close()
        return score

RFC822Ingestor

ingestors.email.msg.RFC822Ingestor

File types

  • multipart/mixed

  • message/rfc822

File extensions

  • .eml

  • .rfc822

  • .email

  • .msg

Bases: Ingestor, EmailSupport, EncodingSupport

Source code in ingestors/email/msg.py
class RFC822Ingestor(Ingestor, EmailSupport, EncodingSupport):
    """Ingest RFC 822 e-mail files: headers, body parts and attachments."""

    MIME_TYPES = ["multipart/mixed", "message/rfc822"]
    BODY_HTML = "text/html"
    BODY_PLAIN = "text/plain"
    BODY_TYPES = [BODY_HTML, BODY_PLAIN]
    BODY_RFC822 = "message/rfc822"
    DISPLAY_HEADERS = ["from", "to", "cc", "bcc", "subject", "reply-to", "date"]
    EXTENSIONS = ["eml", "rfc822", "email", "msg"]
    SCORE = 7

    def has_alternative(self, parent, content_type):
        """Tell whether ``parent`` is multipart/alternative with a ``content_type`` part."""
        if not parent:
            return False

        parent_type = normalize_mimetype(parent.get_content_type())
        if parent_type != "multipart/alternative":
            return False

        return any(
            normalize_mimetype(sibling.get_content_type()) == content_type
            for sibling in parent.get_payload()
        )

    def make_html_alternative(self, text):
        """Return ``text`` HTML-escaped with newlines as ``<br>``; ``None`` if empty."""
        if text:
            return escape(text).strip().replace("\n", "<br>")
        return None

    def decode_part(self, part):
        """Decode the payload of ``part`` using the charset it declares."""
        return self.decode_string(
            part.get_payload(decode=True), part.get_content_charset()
        )

    def parse_html_part(self, entity, part, parent):
        """Extract an HTML body part.

        The extracted text is added as ``bodyText`` only when ``parent``
        offers no text/plain alternative.
        """
        markup = self.decode_part(part)
        extracted = self.extract_html_content(
            entity, markup, extract_metadata=False, add_index_text=False
        )

        if not self.has_alternative(parent, "text/plain"):
            entity.add("bodyText", extracted)

    def parse_plaintext_part(self, entity, part, parent):
        """Add a plain-text body part.

        An HTML rendering of it is added as ``bodyHtml`` only when ``parent``
        offers no text/html alternative.
        """
        body = self.decode_part(part)
        entity.add("bodyText", body)

        if not self.has_alternative(parent, "text/html"):
            entity.add("bodyHtml", self.make_html_alternative(body))

    def parse_rfc822_part(self, entity, part, parent):
        """Add the display headers of an embedded message, then parse its parts."""
        embedded = part.get_payload(0)
        text = "\n".join(
            f"{name}: {value}"
            for name, value in embedded.items()
            if name.lower() in self.DISPLAY_HEADERS
        )
        html = self.make_html_alternative(text)
        entity.add("bodyText", text)
        entity.add("bodyHtml", html)

        self.parse_parts(entity, part)

    def parse_part(self, entity, part, parent):
        """Route a single MIME part to attachment or body handling."""
        mime_type = normalize_mimetype(part.get_content_type())
        file_name = part.get_filename()
        is_body_type = mime_type in self.BODY_TYPES

        # A part counts as an attachment when it is flagged as one, carries
        # a file name, or is a non-multipart part of a non-body type.
        is_attachment = (
            part.is_attachment()
            or file_name is not None
            or not (is_body_type or part.is_multipart())
        )

        if is_attachment:
            if part.is_multipart():
                # The attachment is an email
                payload = str(part.get_payload(i=0))
            else:
                payload = part.get_payload(decode=True)
            self.ingest_attachment(entity, file_name, mime_type, payload)
        elif self.BODY_RFC822 in mime_type:
            self.parse_rfc822_part(entity, part, parent)
        elif part.is_multipart():
            self.parse_parts(entity, part)
        elif self.BODY_HTML in mime_type:
            self.parse_html_part(entity, part, parent)
        elif self.BODY_PLAIN in mime_type:
            self.parse_plaintext_part(entity, part, parent)
        else:
            log.error("Dangling MIME fragment: %s", part)

    def parse_parts(self, entity, parent):
        """Parse every direct sub-part of ``parent``."""
        for child in parent.get_payload():
            self.parse_part(entity, child, parent)

    def ingest_msg(self, entity, msg):
        """Extract headers from ``msg`` and parse its body structure."""
        self.extract_msg_headers(entity, msg)
        self.resolve_message_ids(entity)

        if not msg.is_multipart():
            self.parse_part(entity, msg, None)
        else:
            self.parse_parts(entity, msg)

    def ingest(self, file_path, entity):
        """Parse the file as an e-mail message and ingest it.

        Raises:
            ProcessingException: if the message cannot be parsed.
        """
        entity.schema = model.get("Email")
        try:
            with open(file_path, "rb") as stream:
                msg = email.message_from_binary_file(stream, policy=default)
        except (MessageError, ValueError, IndexError) as err:
            raise ProcessingException("Cannot parse email: %s" % err) from err

        self.ingest_msg(entity, msg)

OpenOfficeSpreadsheetIngestor

ingestors.tabular.ods.OpenOfficeSpreadsheetIngestor

File types

  • application/vnd.oasis.opendocument.spreadsheet

  • application/vnd.oasis.opendocument.spreadsheet-template

File extensions

  • .ods

  • .ots

Bases: Ingestor, TableSupport, OpenDocumentSupport

Source code in ingestors/tabular/ods.py
class OpenOfficeSpreadsheetIngestor(Ingestor, TableSupport, OpenDocumentSupport):
    """Ingestor for OpenDocument spreadsheets and spreadsheet templates.

    Every sheet of the workbook is emitted as a ``Table`` entity whose rows
    are produced by :meth:`generate_csv`.
    """

    MIME_TYPES = [
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet-template",
    ]
    EXTENSIONS = ["ods", "ots"]
    SCORE = 7
    # Typed-value attributes, probed in this order by ``convert_cell``.
    VALUE_FIELDS = ["date-value", "time-value", "boolean-value", "value"]

    def convert_cell(self, cell):
        """Return the value held by ``cell``.

        Currency cells yield ``"<value> <currency>"`` (only the value when no
        currency attribute is set, ``None`` when there is no value). Any other
        cell yields the first non-``None`` attribute named in
        ``VALUE_FIELDS``, falling back to the cell's text content.
        """
        if cell.getAttrNS(OFFICENS, "value-type") == "currency":
            amount = cell.getAttrNS(OFFICENS, "value")
            unit = cell.getAttrNS(OFFICENS, "currency")
            if amount is None or unit is None:
                # No value -> None; value without a currency -> bare value.
                return amount
            return amount + " " + unit

        # Lazily probe the typed attributes and stop at the first hit.
        typed = (cell.getAttrNS(OFFICENS, attr) for attr in self.VALUE_FIELDS)
        found = next((item for item in typed if item is not None), None)
        if found is not None:
            return found

        return self.read_text_cell(cell)

    def read_text_cell(self, cell):
        """Return the text of all paragraphs in ``cell``, newline-separated."""
        return "\n".join(
            extractText(paragraph) for paragraph in cell.getElementsByType(P)
        )

    def generate_csv(self, table):
        """Yield one list of cell values per row of ``table``.

        A cell carrying a ``numbercolumnsrepeated`` attribute contributes its
        value that many times.
        """
        for row in table.getElementsByType(TableRow):
            line = []
            for cell in row.getElementsByType(TableCell):
                span = cell.getAttribute("numbercolumnsrepeated") or 1
                converted = self.convert_cell(cell)
                line.extend([converted] * int(span))
            yield line

    def ingest(self, file_path, entity):
        """Ingest the workbook at ``file_path``, emitting a table per sheet."""
        entity.schema = model.get("Workbook")
        workbook = self.parse_opendocument(file_path, entity)
        for sheet in workbook.spreadsheet.getElementsByType(Table):
            title = sheet.getAttribute("name")
            sheet_entity = self.manager.make_entity("Table", parent=entity)
            sheet_entity.make_id(entity.id, title)
            sheet_entity.set("title", title)
            # Publish a partial fragment (parent reference and title) before
            # the rows are processed, so a failure half-way through does not
            # leave orphan fragments behind.
            # See https://github.com/alephdata/ingest-file/issues/171
            self.manager.emit_entity(sheet_entity, fragment="initial")
            self.emit_row_tuples(sheet_entity, self.generate_csv(sheet))
            if sheet_entity.has("csvHash"):
                self.manager.emit_entity(sheet_entity)

DocumentIngestor

ingestors.documents.office.DocumentIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

  • Open/Libre Office with dependencies
  • image ingestor dependencies to cover any embedded images OCR

File types

  • text/richtext

  • text/rtf

  • application/rtf

  • application/x-rtf

  • application/msword

  • application/vnd.ms-word

  • application/wordperfect

  • application/vnd.wordperfect

  • application/vnd.ms-powerpoint

  • application/vnd.sun.xml.impress

  • application/vnd.ms-powerpoint.presentation

  • application/vnd.ms-powerpoint.presentation.12

  • application/CDFV2-unknown

  • application/CDFV2-corrupt

  • application/clarisworks

  • application/epub+zip

  • application/macwriteii

  • application/msword

  • application/prs.plucker

  • application/vnd.corel-draw

  • application/vnd.lotus-wordpro

  • application/vnd.ms-powerpoint

  • application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml

  • application/vnd.ms-works

  • application/vnd.palm

  • application/vnd.sun.xml.draw

  • application/vnd.sun.xml.draw.template

  • application/vnd.sun.xml.impress

  • application/vnd.sun.xml.impress.template

  • application/vnd.sun.xml.writer

  • application/vnd.sun.xml.writer.global

  • application/vnd.sun.xml.writer.template

  • application/vnd.sun.xml.writer.web

  • application/vnd.visio

  • application/vnd.wordperfect

  • application/x-abiword

  • application/x-aportisdoc

  • application/x-fictionbook+xml

  • application/x-hwp

  • application/x-iwork-keynote-sffkey

  • application/x-iwork-pages-sffpages

  • application/x-mspublisher

  • application/x-mswrite

  • application/x-pagemaker

  • application/x-sony-bbeb

  • application/x-t602

  • image/x-cmx

  • image/x-freehand

  • image/x-wpg

File extensions

  • .602

  • .abw

  • .cdr

  • .cmx

  • .cwk

  • .doc

  • .dot

  • .dps

  • .dpt

  • .epub

  • .fb2

  • .fh

  • .fh1

  • .fh10

  • .fh11

  • .fh2

  • .fh3

  • .fh4

  • .fh5

  • .fh6

  • .fh7

  • .fh8

  • .fh9

  • .fodg

  • .fodp

  • .fodt

  • .hwp

  • .key

  • .lrf

  • .lwp

  • .mcw

  • .mw

  • .mwd

  • .nxd

  • .odg

  • .odm

  • .otg

  • .oth

  • .otm

  • .otp

  • .ott

  • .p65

  • .pages

  • .pdb

  • .pm

  • .pm6

  • .pmd

  • .pot

  • .pps

  • .ppt

  • .pub

  • .qxd

  • .qxt

  • .rtf

  • .sda

  • .sdd

  • .sdw

  • .std

  • .sti

  • .stw

  • .sxd

  • .sxg

  • .sxi

  • .sxw

  • .vdx

  • .vsd

  • .vsdm

  • .vsdx

  • .wn

  • .wpd

  • .wpg

  • .wps

  • .wpt

  • .wri

  • .xlc

  • .xlm

  • .xls

  • .xlw

  • .zabw

  • .zmf

Bases: Ingestor, OLESupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

  • Open/Libre Office with dependencies
  • image ingestor dependencies to cover any embedded images OCR
Source code in ingestors/documents/office.py
class DocumentIngestor(Ingestor, OLESupport, PDFSupport):
    """Office/Word document ingestor class.

    Converts the document to PDF and extracts the text.
    Mostly a slightly adjusted PDF ingestor.

    Requires system tools:

    - Open/Libre Office with dependencies
    - image ingestor dependencies to cover any embedded images OCR
    """

    MIME_TYPES = [
        # Text documents
        "text/richtext",
        "text/rtf",
        "application/rtf",
        "application/x-rtf",
        "application/msword",
        "application/vnd.ms-word",
        "application/wordperfect",
        "application/vnd.wordperfect",
        # Presentations
        "application/vnd.ms-powerpoint",
        "application/vnd.sun.xml.impress",
        "application/vnd.ms-powerpoint.presentation",
        "application/vnd.ms-powerpoint.presentation.12",
        # MS Office files with short stream missing
        "application/CDFV2-unknown",
        # NOTE: these two must stay separate entries; without the comma
        # Python concatenates the adjacent literals into one bogus type.
        "application/CDFV2-corrupt",
        "application/clarisworks",  # ClarisWorks_Draw
        "application/epub+zip",  # EPUB Document
        "application/macwriteii",  # MacWrite
        "application/msword",  # MS Word 2007 XML VBA
        "application/prs.plucker",  # Plucker eBook
        "application/vnd.corel-draw",  # Corel Draw Document
        "application/vnd.lotus-wordpro",  # LotusWordPro
        "application/vnd.ms-powerpoint",  # MS PowerPoint 97 Vorlage
        "application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml",  # Impress MS PowerPoint 2007 XML VBA  # noqa
        "application/vnd.ms-works",  # Mac_Works
        "application/vnd.palm",  # Palm_Text_Document
        "application/vnd.sun.xml.draw",  # StarOffice XML (Draw)
        "application/vnd.sun.xml.draw.template",  # draw_StarOffice_XML_Draw_Template  # noqa
        "application/vnd.sun.xml.impress",  # StarOffice XML (Impress)
        "application/vnd.sun.xml.impress.template",  # impress_StarOffice_XML_Impress_Template  # noqa
        "application/vnd.sun.xml.writer",  # StarOffice XML (Writer)
        "application/vnd.sun.xml.writer.global",  # writer_globaldocument_StarOffice_XML_Writer_GlobalDocument  # noqa
        "application/vnd.sun.xml.writer.template",  # writer_StarOffice_XML_Writer_Template  # noqa
        "application/vnd.sun.xml.writer.web",  # writer_web_StarOffice_XML_Writer_Web_Template  # noqa
        "application/vnd.visio",  # Visio Document
        "application/vnd.wordperfect",  # WordPerfect
        "application/x-abiword",  # AbiWord
        "application/x-aportisdoc",  # PalmDoc
        "application/x-fictionbook+xml",  # FictionBook 2
        "application/x-hwp",  # writer_MIZI_Hwp_97
        "application/x-iwork-keynote-sffkey",  # Apple Keynote
        "application/x-iwork-pages-sffpages",  # Apple Pages
        "application/x-mspublisher",  # Publisher Document
        "application/x-mswrite",  # MS_Write
        "application/x-pagemaker",  # PageMaker Document
        "application/x-sony-bbeb",  # BroadBand eBook
        "application/x-t602",  # T602Document
        "image/x-cmx",  # Corel Presentation Exchange
        "image/x-freehand",  # Freehand Document
        "image/x-wpg",  # WordPerfect Graphics
    ]
    EXTENSIONS = [
        "602",  # T602Document
        "abw",  # AbiWord
        "cdr",  # Corel Draw Document
        "cmx",  # Corel Presentation Exchange
        "cwk",  # ClarisWorks_Draw
        "doc",  # Mac_Word
        "dot",  # MS Word 97 Vorlage
        "dps",  # MS PowerPoint 97
        "dpt",  # MS PowerPoint 97 Vorlage
        "epub",  # EPUB Document
        "fb2",  # FictionBook 2
        "fh",  # Freehand Document
        "fh1",  # Freehand Document
        "fh10",  # Freehand Document
        "fh11",  # Freehand Document
        "fh2",  # Freehand Document
        "fh3",  # Freehand Document
        "fh4",  # Freehand Document
        "fh5",  # Freehand Document
        "fh6",  # Freehand Document
        "fh7",  # Freehand Document
        "fh8",  # Freehand Document
        "fh9",  # Freehand Document
        "fodg",  # OpenDocument Drawing Flat XML
        "fodp",  # OpenDocument Presentation Flat XML
        "fodt",  # OpenDocument Text Flat XML
        "hwp",  # writer_MIZI_Hwp_97
        "key",  # Apple Keynote
        "lrf",  # BroadBand eBook
        "lwp",  # LotusWordPro
        "mcw",  # MacWrite
        "mw",  # MacWrite
        "mwd",  # Mariner_Write
        "nxd",  # WriteNow
        "odg",  # draw8
        "odm",  # writerglobal8
        "otg",  # draw8_template
        "oth",  # writerweb8_writer_template
        "otm",  # writerglobal8_template
        "otp",  # impress8_template
        "ott",  # writer8_template
        "p65",  # PageMaker Document
        "pages",  # Apple Pages
        "pdb",  # Palm_Text_Document
        "pm",  # PageMaker Document
        "pm6",  # PageMaker Document
        "pmd",  # PageMaker Document
        "pot",  # PowerPoint 3
        "pps",  # MS PowerPoint 97 AutoPlay
        "ppt",  # PowerPoint 3
        # 'pptm',  # Impress Office Open XML
        "pub",  # Publisher Document
        "qxd",  # QXP Document
        "qxt",  # QXP Document
        "rtf",  # Rich Text Format
        "sda",  # StarOffice_Drawing
        "sdd",  # StarOffice_Presentation
        "sdw",  # StarOffice_Writer
        "std",  # draw_StarOffice_XML_Draw_Template
        "sti",  # impress_StarOffice_XML_Impress_Template
        "stw",  # writer_StarOffice_XML_Writer_Template
        "sxd",  # StarOffice XML (Draw)
        "sxg",  # writer_globaldocument_StarOffice_XML_Writer_GlobalDocument
        "sxi",  # StarOffice XML (Impress)
        "sxw",  # StarOffice XML (Writer)
        # 'tab',  # Text
        # 'tsv',  # Text
        # 'txt',  # Text
        "vdx",  # Visio Document
        "vsd",  # Visio Document
        "vsdm",  # Visio Document
        "vsdx",  # Visio Document
        "wn",  # WriteNow
        "wpd",  # WordPerfect
        "wpg",  # WordPerfect Graphics
        "wps",  # Mac_Works
        "wpt",  # MS Word 97 Vorlage
        "wri",  # MS_Write
        "xlc",  # MS Excel 95
        "xlm",  # MS Excel 95
        "xls",  # MS Excel 95
        "xlw",  # MS Excel 95
        # 'xml',  # OpenDocument Drawing Flat XML
        "zabw",  # AbiWord
        # 'zip',  # FictionBook 2
        "zmf",  # ZMF Document
    ]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Convert the document to PDF and extract its pages into ``entity``.

        Sets the entity schema to ``Pages``, reads OLE metadata from the
        source file, then converts it to PDF inside a temporary directory
        that is removed when the ``with`` block exits.
        """
        entity.schema = model.get("Pages")
        self.extract_ole_metadata(file_path, entity)
        with tempfile.TemporaryDirectory() as unique_tmpdir:
            # TODO - write to logs the case in which the context manager can't delete these dirs
            pdf_path = self.document_to_pdf(unique_tmpdir, file_path, entity)
            self.pdf_alternative_extract(entity, pdf_path, self.manager)

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/office.py
def ingest(self, file_path, entity):
    """Turn an office document into PDF and extract its pages.

    The entity becomes a ``Pages`` entity, OLE metadata is read from the
    source file, and the PDF conversion runs inside a throw-away directory.
    """
    pages_schema = model.get("Pages")
    entity.schema = pages_schema
    self.extract_ole_metadata(file_path, entity)
    with tempfile.TemporaryDirectory() as scratch_dir:
        # TODO - log the case where the context manager fails to delete this directory
        converted = self.document_to_pdf(scratch_dir, file_path, entity)
        self.pdf_alternative_extract(entity, converted, self.manager)

OutlookMsgIngestor

ingestors.email.outlookmsg.OutlookMsgIngestor

File types

  • application/msg

  • application/x-msg

  • application/vnd.ms-outlook

  • msg/rfc822

File extensions

  • .msg

Bases: Ingestor, EmailSupport, OLESupport, TempFileSupport

Source code in ingestors/email/outlookmsg.py
class OutlookMsgIngestor(Ingestor, EmailSupport, OLESupport, TempFileSupport):
    """Ingestor for Outlook ``.msg`` message files.

    Files are only accepted when they are OLE files (see :meth:`match`).
    """

    MIME_TYPES = [
        "application/msg",
        "application/x-msg",
        "application/vnd.ms-outlook",
        "msg/rfc822",
    ]
    EXTENSIONS = ["msg"]
    SCORE = 10

    def get_identity(self, name, email):
        """Build a single ``EmailIdentity`` from a name/address pair."""
        return EmailIdentity(self.manager, name, email)

    def ingest(self, file_path, entity):
        """Open the message at ``file_path`` and ingest it as an ``Email``.

        Raises ``ProcessingException`` (chained to the original error) when
        the message file cannot be opened.
        """
        entity.schema = model.get("Email")
        try:
            msg = Message(file_path.as_posix())
        except Exception as exc:
            # ``msg`` is rebound here to the error text for the exception.
            msg = "Cannot open message file: %s" % exc
            raise ProcessingException(msg) from exc

        self.extract_olefileio_metadata(msg.ole, entity)
        self.ingest_message(msg, entity)

    def ingest_message(self, msg, entity):
        """Copy the contents of ``msg`` onto ``entity``.

        Covers headers, subject and bodies, the RTF body (stored as a child
        ``Document``), sender/recipient identities and attachments. Attached
        messages are handled by calling this method recursively on a child
        ``Email`` entity.
        """
        # Header parsing is best-effort: a failure is logged and ingestion
        # carries on with the remaining fields.
        try:
            self.extract_msg_headers(entity, msg.header)
        except Exception:
            log.exception("Cannot parse Outlook-stored headers")

        entity.add("subject", msg.subject)
        # NOTE(review): "0070" is presumably the MAPI conversation-topic
        # property - confirm against the extract_msg documentation.
        entity.add("threadTopic", msg.getStringField("0070"))
        entity.add("encoding", msg.encoding)
        entity.add("bodyText", msg.body)
        entity.add("bodyHtml", msg.htmlBody)
        entity.add("messageId", self.parse_message_ids(msg.message_id))

        # Reading the RTF body may raise; treat that as "no RTF body".
        try:
            rtf_body = msg.rtfBody
        except Exception:
            log.exception("Cannot parse RTF body of the email")
            rtf_body = None

        if rtf_body is not None:
            # Write the RTF body to a work file, store it, then remove the
            # work file again; the stored content is queued as a child
            # Document entity.
            rtf_path = self.make_work_file("body.rtf")
            with open(rtf_path, "wb") as fh:
                fh.write(rtf_body)
            checksum = self.manager.store(rtf_path, mime_type=RTF_MIME)
            rtf_path.unlink()

            child = self.manager.make_entity("Document", parent=entity)
            child.make_id(entity.id, "outlook-msg.rtf.body")
            child.add("fileName", "body.rtf")
            child.add("contentHash", checksum)
            child.add("mimeType", RTF_MIME)
            self.manager.queue_entity(child)

        # Only derive inReplyTo from the references when none is set yet.
        if not entity.has("inReplyTo"):
            entity.add("inReplyTo", self.parse_references(msg.references, []))

        # An unparseable date is logged and skipped rather than aborting.
        try:
            date = parsedate_to_datetime(msg.date).isoformat()
            entity.add("date", date)
        except Exception:
            log.warning("Could not parse date: %s", msg.date)

        # sender name and email
        sender = self.get_identities(msg.sender)
        self.apply_identities(entity, sender, "emitters", "sender")

        # received by
        sender = self.get_identity(
            msg.getStringField("0040"), msg.getStringField("0076")
        )
        self.apply_identities(entity, sender, "emitters")

        froms = self.get_identities(msg.getStringField("1046"))
        self.apply_identities(entity, froms, "emitters", "from")

        tos = self.get_identities(msg.to)
        self.apply_identities(entity, tos, "recipients", "to")

        ccs = self.get_identities(msg.cc)
        self.apply_identities(entity, ccs, "recipients", "cc")

        bccs = self.get_identities(msg.bcc)
        self.apply_identities(entity, bccs, "recipients", "bcc")

        self.resolve_message_ids(entity)
        for attachment in msg.attachments:
            if attachment.type == "msg":
                # Embedded message: ingest recursively into a child Email
                # and emit it as a fragment keyed by the attachment prefix.
                child = self.manager.make_entity("Email", parent=entity)
                child.make_id(entity.id, attachment.data.prefix)
                child.add("fileName", attachment.long_filename)
                child.add("fileName", attachment.short_filename)
                child.add("mimeType", "application/vnd.ms-outlook")
                self.ingest_message(attachment.data, child)
                self.manager.emit_entity(child, fragment=attachment.data.prefix)
            if attachment.type == "data":
                # Regular attachment: prefer the long file name, fall back
                # to the short one.
                name = stringify(attachment.long_filename)
                name = name or stringify(attachment.short_filename)
                self.ingest_attachment(
                    entity, name, attachment.content_type, attachment.data
                )

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited match score, or ``-1`` for non-OLE files.

        The ``-1`` is only returned when the inherited score is positive but
        ``isOleFile`` rejects the file.
        """
        score = super(OutlookMsgIngestor, cls).match(file_path, entity)
        if score > 0 and not isOleFile(file_path):
            return -1
        return score

OutlookOLMArchiveIngestor

ingestors.email.olm.OutlookOLMArchiveIngestor

File types

File extensions

  • .olm

Bases: Ingestor, TempFileSupport, XMLSupport

Source code in ingestors/email/olm.py
class OutlookOLMArchiveIngestor(Ingestor, TempFileSupport, XMLSupport):
    """Ingestor for Outlook for Mac ``.olm`` archives.

    An OLM file is a zip archive in which each email is stored as a
    ``message_*.xml`` member. Every message is extracted, stored and queued
    as its own document, together with the attachments it references.
    """

    MIME_TYPES = []
    EXTENSIONS = ["olm"]
    SCORE = 10
    # Path segments that never become Folder entities.
    EXCLUDE = ["com.microsoft.__Messages"]

    def extract_file(self, zipf, name):
        """Extract a message file from the OLM zip archive.

        Returns the path of the work file. If ``name`` is not a member of
        the archive a warning is logged and the (empty) work file is still
        returned.
        """
        path = pathlib.Path(name)
        base_name = safe_filename(path.name)
        out_file = self.make_work_file(base_name)
        with open(out_file, "w+b") as outfh:
            try:
                with zipf.open(name) as infh:
                    shutil.copyfileobj(infh, outfh)
            except KeyError:
                log.warning("Cannot load zip member: %s", name)
        return out_file

    def extract_hierarchy(self, entity, name):
        """Given a file path, create all its ancestor folders as entities.

        Returns the innermost folder entity created, or ``entity`` itself
        when no folder was created.
        """
        foreign_id = pathlib.PurePath(entity.id)
        path = ensure_path(name)
        for segment in path.as_posix().split("/")[:-1]:
            # Excluded segments still contribute to the ID path, they just
            # do not get an entity of their own.
            foreign_id = foreign_id.joinpath(segment)
            if segment in self.EXCLUDE:
                continue
            entity = self.manager.make_entity("Folder", parent=entity)
            entity.add("fileName", segment)
            entity.make_id(foreign_id.as_posix())
            self.manager.emit_entity(entity)
        return entity

    def extract_attachment(self, zipf, message, attachment):
        """Create an entity for an attachment; assign its parent and put it
        on the task queue to be processed.

        Attachments without an ``OPFAttachmentURL`` are not stored or queued.
        """
        url = attachment.get("OPFAttachmentURL")
        name = attachment.get("OPFAttachmentName")
        name = name or attachment.get("OPFAttachmentContentID")
        child = self.manager.make_entity("Document", parent=message)
        if url is not None:
            file_path = self.extract_file(zipf, url)
            mime_type = attachment.get("OPFAttachmentContentType")
            checksum = self.manager.store(file_path, mime_type=mime_type)
            child.make_id(name, checksum)
            child.add("fileName", attachment.get("OPFAttachmentName"))
            child.add("fileName", attachment.get("OPFAttachmentContentID"))
            child.add("mimeType", mime_type)
            child.add("contentHash", checksum)
            self.manager.queue_entity(child)

    def extract_message(self, root, zipf, name):
        """Queue one archive member for ingestion if it is a message file.

        Members that are not ``message_*.xml`` files are ignored.
        """
        # Individual messages are stored as message_xxx.xml files. We want to
        # process these files and skip the others
        if "message_" not in name or not name.endswith(".xml"):
            return
        # Create the parent folders as entities with proper hierarchy
        parent = self.extract_hierarchy(root, name)
        # Extract the xml file itself and put it on the task queue to be
        # ingested by OutlookOLMMessageIngestor as an individual message
        xml_path = self.extract_file(zipf, name)
        checksum = self.manager.store(xml_path, mime_type=MIME)
        child = self.manager.make_entity("Document", parent=parent)
        child.make_id(checksum)
        child.add("contentHash", checksum)
        child.add("mimeType", MIME)
        self.manager.queue_entity(child)
        try:
            doc = self.parse_xml_path(xml_path)
            # find all attachments mentioned in the current xml file, assign
            # them their parent and put them on the queue to be processed
            for el in doc.findall(".//messageAttachment"):
                self.extract_attachment(zipf, child, el)
        except ProcessingException:
            # Best effort: the message itself is already queued, so only its
            # attachments are skipped. Leave a trace instead of failing
            # silently.
            log.warning("Cannot extract attachments of message: %s", name)

    def ingest(self, file_path, entity):
        """Ingest every message contained in the OLM archive at ``file_path``.

        A failure on one member is logged and does not stop the remaining
        members. Raises ``ProcessingException`` (chained to the zip error)
        when the file is not a valid zip archive.
        """
        entity.schema = model.get("Package")
        # NOTE(review): not read anywhere in this class - confirm whether
        # other code still relies on this attribute.
        self._hierarchy = {}
        try:
            # OLM files are zip archives with emails stored as xml files
            with zipfile.ZipFile(file_path, "r") as zipf:
                for name in zipf.namelist():
                    try:
                        self.extract_message(entity, zipf, name)
                    except Exception:
                        log.exception("Error processing message: %s", name)
        except zipfile.BadZipfile as err:
            raise ProcessingException("Invalid OLM file.") from err

extract_attachment(zipf, message, attachment)

Create an entity for an attachment; assign its parent and put it on the task queue to be processed

Source code in ingestors/email/olm.py
def extract_attachment(self, zipf, message, attachment):
    """Register one message attachment as a child ``Document`` entity.

    The entity is always created under ``message``; it is only given an ID,
    populated, stored and queued when the attachment carries an
    ``OPFAttachmentURL``.
    """
    source = attachment.get("OPFAttachmentURL")
    label = attachment.get("OPFAttachmentName") or attachment.get(
        "OPFAttachmentContentID"
    )
    document = self.manager.make_entity("Document", parent=message)
    if source is None:
        return

    stored_path = self.extract_file(zipf, source)
    content_type = attachment.get("OPFAttachmentContentType")
    digest = self.manager.store(stored_path, mime_type=content_type)
    document.make_id(label, digest)
    # Both the display name and the content ID are recorded as file names.
    for key in ("OPFAttachmentName", "OPFAttachmentContentID"):
        document.add("fileName", attachment.get(key))
    document.add("mimeType", content_type)
    document.add("contentHash", digest)
    self.manager.queue_entity(document)

extract_file(zipf, name)

Extract a message file from the OLM zip archive

Source code in ingestors/email/olm.py
def extract_file(self, zipf, name):
    """Copy the archive member ``name`` into a fresh work file.

    The work file path is returned in every case; when the archive has no
    such member a warning is logged and the file stays empty.
    """
    target = self.make_work_file(safe_filename(pathlib.Path(name).name))
    with open(target, "w+b") as sink:
        try:
            with zipf.open(name) as member:
                shutil.copyfileobj(member, sink)
        except KeyError:
            log.warning("Cannot load zip member: %s", name)
    return target

extract_hierarchy(entity, name)

Given a file path, create all its ancestor folders as entities

Source code in ingestors/email/olm.py
def extract_hierarchy(self, entity, name):
    """Create a ``Folder`` entity for each ancestor directory of ``name``.

    Returns the innermost folder created, or the ``entity`` passed in when
    nothing was created. Segments listed in ``EXCLUDE`` extend the ID path
    but do not get a folder of their own.
    """
    current = entity
    id_path = pathlib.PurePath(entity.id)
    ancestors = ensure_path(name).as_posix().split("/")[:-1]
    for segment in ancestors:
        id_path = id_path / segment
        if segment not in self.EXCLUDE:
            current = self.manager.make_entity("Folder", parent=current)
            current.add("fileName", segment)
            current.make_id(id_path.as_posix())
            self.manager.emit_entity(current)
    return current

OfficeOpenXMLIngestor

ingestors.documents.ooxml.OfficeOpenXMLIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

File types

  • application/vnd.openxmlformats-officedocument.wordprocessingml.document

  • application/vnd.openxmlformats-officedocument.wordprocessingml.template

  • application/vnd.openxmlformats-officedocument.presentationml.slideshow

  • application/vnd.openxmlformats-officedocument.presentationml.presentation

  • application/vnd.openxmlformats-officedocument.presentationml.template

  • application/vnd.openxmlformats-officedocument.presentationml.slideshow

File extensions

  • .docx

  • .docm

  • .dotx

  • .dotm

  • .potx

  • .pptx

  • .ppsx

  • .pptm

  • .ppsm

  • .potm

Bases: Ingestor, OOXMLSupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Source code in ingestors/documents/ooxml.py
class OfficeOpenXMLIngestor(Ingestor, OOXMLSupport, PDFSupport):
    """Ingestor for Office Open XML text documents and presentations.

    The document is converted to PDF and the text is extracted from that
    PDF, so this is mostly a slightly adjusted PDF ingestor.
    """

    PREFIX = "application/vnd.openxmlformats-officedocument."
    MIME_TYPES = [
        f"{PREFIX}wordprocessingml.document",
        f"{PREFIX}wordprocessingml.template",
        f"{PREFIX}presentationml.slideshow",
        f"{PREFIX}presentationml.presentation",
        f"{PREFIX}presentationml.template",
        f"{PREFIX}presentationml.slideshow",
    ]
    EXTENSIONS = [
        "docx",
        "docm",
        "dotx",
        "dotm",
        "potx",
        "pptx",
        "ppsx",
        "pptm",
        "ppsm",
        "potm",
    ]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Extract OOXML metadata, convert to PDF and ingest the pages."""
        pages_schema = model.get("Pages")
        entity.schema = pages_schema
        self.ooxml_extract_metadata(file_path, entity)
        with tempfile.TemporaryDirectory() as scratch_dir:
            # TODO - log the case where the context manager fails to delete this directory
            converted = self.document_to_pdf(scratch_dir, file_path, entity)
            self.pdf_alternative_extract(entity, converted, self.manager)

    @classmethod
    def match(cls, file_path, entity):
        """Score the file, doubling ``SCORE`` when the OOXML manifest matches.

        A non-positive inherited score is returned unchanged without
        inspecting the manifest.
        """
        inherited = super().match(file_path, entity)
        if inherited <= 0:
            return inherited
        if not cls.inspect_ooxml_manifest(file_path):
            return inherited
        return cls.SCORE * 2

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/ooxml.py
def ingest(self, file_path, entity):
    """Turn an OOXML document into PDF and extract its pages.

    The entity becomes a ``Pages`` entity, OOXML metadata is read from the
    source file, and the PDF conversion runs inside a throw-away directory.
    """
    pages_schema = model.get("Pages")
    entity.schema = pages_schema
    self.ooxml_extract_metadata(file_path, entity)
    with tempfile.TemporaryDirectory() as scratch_dir:
        # TODO - log the case where the context manager fails to delete this directory
        converted = self.document_to_pdf(scratch_dir, file_path, entity)
        self.pdf_alternative_extract(entity, converted, self.manager)

OpenDocumentIngestor

ingestors.documents.opendoc.OpenDocumentIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

  • Open/Libre Office with dependencies
  • image ingestor dependencies to cover any embedded images OCR

File types

  • application/vnd.oasis.opendocument.text

  • application/vnd.oasis.opendocument.text-template

  • application/vnd.oasis.opendocument.presentation

  • application/vnd.oasis.opendocument.graphics

  • application/vnd.oasis.opendocument.graphics-flat-xml

  • application/vnd.oasis.opendocument.graphics-template

  • application/vnd.oasis.opendocument.presentation-flat-xml

  • application/vnd.oasis.opendocument.presentation-template

  • application/vnd.oasis.opendocument.chart

  • application/vnd.oasis.opendocument.chart-template

  • application/vnd.oasis.opendocument.image

  • application/vnd.oasis.opendocument.image-template

  • application/vnd.oasis.opendocument.formula

  • application/vnd.oasis.opendocument.formula-template

  • application/vnd.oasis.opendocument.text-flat-xml

  • application/vnd.oasis.opendocument.text-master

  • application/vnd.oasis.opendocument.text-web

File extensions

  • .odt

  • .odp

  • .otp

Bases: Ingestor, OpenDocumentSupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

  • Open/Libre Office with dependencies
  • image ingestor dependencies to cover any embedded images OCR
Source code in ingestors/documents/opendoc.py
class OpenDocumentIngestor(Ingestor, OpenDocumentSupport, PDFSupport):
    """Office/Word document ingestor class.

    Converts the document to PDF and extracts the text.
    Mostly a slightly adjusted PDF ingestor.

    Requires system tools:

    - Open/Libre Office with dependencies
    - image ingestor dependencies to cover any embedded images OCR
    """

    MIME_TYPES = [
        "application/vnd.oasis.opendocument.text",
        "application/vnd.oasis.opendocument.text-template",
        "application/vnd.oasis.opendocument.presentation",
        "application/vnd.oasis.opendocument.graphics",
        "application/vnd.oasis.opendocument.graphics-flat-xml",
        # NOTE: these two must stay separate entries; without the comma
        # Python concatenates the adjacent literals into one bogus type.
        "application/vnd.oasis.opendocument.graphics-template",
        "application/vnd.oasis.opendocument.presentation-flat-xml",
        "application/vnd.oasis.opendocument.presentation-template",
        "application/vnd.oasis.opendocument.chart",
        "application/vnd.oasis.opendocument.chart-template",
        "application/vnd.oasis.opendocument.image",
        "application/vnd.oasis.opendocument.image-template",
        "application/vnd.oasis.opendocument.formula",
        "application/vnd.oasis.opendocument.formula-template",
        "application/vnd.oasis.opendocument.text-flat-xml",
        "application/vnd.oasis.opendocument.text-master",
        "application/vnd.oasis.opendocument.text-web",
    ]
    EXTENSIONS = ["odt", "odp", "otp"]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Convert the document to PDF and extract its pages into ``entity``.

        Sets the entity schema to ``Pages``, parses the OpenDocument file
        (the parsed document itself is not used here), then converts the
        file to PDF inside a temporary directory that is removed when the
        ``with`` block exits.
        """
        entity.schema = model.get("Pages")
        self.parse_opendocument(file_path, entity)
        with tempfile.TemporaryDirectory() as unique_tmpdir:
            # TODO - write to logs the case in which the context manager can't delete these dirs
            pdf_path = self.document_to_pdf(unique_tmpdir, file_path, entity)
            self.pdf_alternative_extract(entity, pdf_path, self.manager)

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/opendoc.py
def ingest(self, file_path, entity):
    """Turn an OpenDocument file into PDF and extract its pages.

    The entity becomes a ``Pages`` entity, the OpenDocument file is parsed,
    and the PDF conversion runs inside a throw-away directory.
    """
    pages_schema = model.get("Pages")
    entity.schema = pages_schema
    self.parse_opendocument(file_path, entity)
    with tempfile.TemporaryDirectory() as scratch_dir:
        # TODO - log the case where the context manager fails to delete this directory
        converted = self.document_to_pdf(scratch_dir, file_path, entity)
        self.pdf_alternative_extract(entity, converted, self.manager)

OutlookOLMMessageIngestor

ingestors.email.olm.OutlookOLMMessageIngestor

File types

  • application/xml+opfmessage

File extensions

Bases: Ingestor, XMLSupport, EmailSupport, TimestampSupport

Source code in ingestors/email/olm.py
class OutlookOLMMessageIngestor(Ingestor, XMLSupport, EmailSupport, TimestampSupport):
    """Ingest a single OPF XML message file as an ``Email`` entity.

    The file must contain exactly one ``<email>`` element; its child
    elements are read as message properties (subject, dates, bodies) and
    its address lists are turned into email identities.
    """

    MIME_TYPES = [MIME]
    EXTENSIONS = []
    SCORE = 15

    def get_contacts(self, doc, tag):
        """Yield an ``EmailIdentity`` for every ``emailAddress`` below ``tag``."""
        path = "./%s/emailAddress" % tag
        for address in doc.findall(path):
            name = address.get("OPFContactEmailAddressName")
            email = address.get("OPFContactEmailAddressAddress")
            yield EmailIdentity(self.manager, name, email)

    def get_date(self, props, tag):
        """Remove ``tag`` from ``props`` and return it parsed as a timestamp."""
        return self.parse_timestamp(props.pop(tag, None))

    def ingest(self, file_path, entity):
        """Populate ``entity`` from the OPF message stored at ``file_path``.

        Raises:
            ProcessingException: if the XML cannot be parsed, or the file
                does not contain exactly one ``<email>`` element.
        """
        entity.schema = model.get("Email")
        try:
            doc = self.parse_xml_path(file_path)
        except TypeError as terr:
            raise ProcessingException("Cannot parse OPF XML file.") from terr

        emails = doc.findall(".//email")
        if not emails:
            # Previously reported (incorrectly) as "More than one email".
            raise ProcessingException("No email found in file.")
        if len(emails) > 1:
            raise ProcessingException("More than one email in file.")

        email = emails[0]
        # Iterate the element directly; getchildren() is deprecated.
        props = {c.tag: stringify(c.text) for c in email if c.text}
        # from pprint import pformat
        # log.info(pformat(props))

        entity.add("subject", props.pop("OPFMessageCopySubject", None))
        entity.add("threadTopic", props.pop("OPFMessageCopyThreadTopic", None))
        entity.add("summary", props.pop("OPFMessageCopyPreview", None))
        # message IDs are already parsed, no need to clean prior:
        entity.add("messageId", props.pop("OPFMessageCopyMessageID", None))
        entity.add("date", self.get_date(props, "OPFMessageCopySentTime"))
        entity.add("modifiedAt", self.get_date(props, "OPFMessageCopyModDate"))

        senders = self.get_contacts(email, "OPFMessageCopySenderAddress")
        self.apply_identities(entity, senders, "emitters", "sender")

        froms = self.get_contacts(  # codespell:ignore
            email, "OPFMessageCopyFromAddresses"
        )
        self.apply_identities(entity, froms, "emitters", "from")  # codespell:ignore

        tos = self.get_contacts(email, "OPFMessageCopyToAddresses")
        self.apply_identities(entity, tos, "recipients", "to")

        ccs = self.get_contacts(email, "OPFMessageCopyCCAddresses")
        self.apply_identities(entity, ccs, "recipients", "cc")

        bccs = self.get_contacts(email, "OPFMessageCopyBCCAddresses")
        self.apply_identities(entity, bccs, "recipients", "bcc")

        entity.add("bodyText", props.pop("OPFMessageCopyBody", None))
        html = props.pop("OPFMessageCopyHTMLBody", None)
        # The "has HTML" flag is compared against the literal text "1E0".
        has_html = "1E0" == props.pop("OPFMessageGetHasHTML", None)
        if has_html and stringify(html):
            self.extract_html_content(entity, html, extract_metadata=False)

        self.resolve_message_ids(entity)

PDFIngestor

ingestors.documents.pdf.PDFIngestor

PDF file ingestor class.

Extracts the text from the document by converting it first to XML. Splits the file into pages.

File types

  • application/pdf

File extensions

  • .pdf

Bases: Ingestor, PDFSupport

PDF file ingestor class.

Extracts the text from the document by converting it first to XML. Splits the file into pages.

Source code in ingestors/documents/pdf.py
class PDFIngestor(Ingestor, PDFSupport):
    """PDF file ingestor class.

    Extracts the text from the document by converting it first to XML.
    Splits the file into pages.
    """

    # Leading signature used by match() to sniff files that were not
    # recognised by MIME type or extension.
    MAGIC = "%PDF-1."
    MIME_TYPES = ["application/pdf"]
    EXTENSIONS = ["pdf"]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        Raises:
            ProcessingException: with ``ENCRYPTED_MSG`` when parsing raises
                ``UnauthorizedError``, or with a generic message for any
                other failure.
        """
        try:
            self.parse_and_ingest(file_path, entity, self.manager)
        except UnauthorizedError as pwe:
            raise ProcessingException(ENCRYPTED_MSG) from pwe
        except Exception as ex:
            raise ProcessingException("Could not extract PDF file: %r" % ex) from ex

    @classmethod
    def match(cls, file_path, entity):
        """Return the match score, sniffing the file header as a fallback.

        When the inherited match gives no positive score, the first bytes
        of the file are compared with ``MAGIC``; a hit returns twice
        ``SCORE``.
        """
        score = super(PDFIngestor, cls).match(file_path, entity)
        if score <= 0:
            # The file is read in binary mode, so the signature must be
            # bytes too: comparing bytes with str is always False.
            magic = cls.MAGIC
            if not isinstance(magic, bytes):
                magic = magic.encode("ascii")
            with open(file_path, "rb") as fh:
                if fh.read(len(magic)) == magic:
                    return cls.SCORE * 2
        return score

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/pdf.py
def ingest(self, file_path, entity):
    """Parse the PDF and translate any failure into a ProcessingException.

    An ``UnauthorizedError`` is reported with ``ENCRYPTED_MSG``; every
    other exception is reported with its ``repr`` in the message.
    """
    try:
        self.parse_and_ingest(file_path, entity, self.manager)
    except UnauthorizedError as locked:
        raise ProcessingException(ENCRYPTED_MSG) from locked
    except Exception as failure:
        message = "Could not extract PDF file: %r" % failure
        raise ProcessingException(message) from failure

PlainTextIngestor

ingestors.documents.plain.PlainTextIngestor

Plain text file ingestor class.

Extracts the text from the document and enforces unicode on it.

File types

  • text/plain

  • text/x-c

  • text/x-c++

  • text/x-diff

  • text/x-python

  • text/x-shellscript

  • text/x-java

  • text/x-php

  • text/troff

  • text/x-ruby

  • text/x-pascal

  • text/x-msdos-batch

  • text/x-yaml

  • text/x-makefile

  • text/x-perl

  • text/x-objective-c

  • text/x-msdos-batch

  • text/x-asm

  • text/x-csrc

  • text/x-sh

  • text/javascript

  • text/x-algol68

File extensions

  • .txt

  • .md

  • .rst

  • .nfo

Bases: Ingestor, EncodingSupport

Plain text file ingestor class.

Extracts the text from the document and enforces unicode on it.

Source code in ingestors/documents/plain.py
class PlainTextIngestor(Ingestor, EncodingSupport):
    """Plain text file ingestor class.

    Extracts the text from the document and enforces unicode on it.
    """

    # Each MIME type is listed once ("text/x-msdos-batch" used to appear twice).
    MIME_TYPES = [
        "text/plain",
        "text/x-c",
        "text/x-c++",
        "text/x-diff",
        "text/x-python",
        "text/x-shellscript",
        "text/x-java",
        "text/x-php",
        "text/troff",
        "text/x-ruby",
        "text/x-pascal",
        "text/x-msdos-batch",
        "text/x-yaml",
        "text/x-makefile",
        "text/x-perl",
        "text/x-objective-c",
        "text/x-asm",
        "text/x-csrc",
        "text/x-sh",
        "text/javascript",
        "text/x-algol68",
    ]
    EXTENSIONS = ["txt", "md", "rst", "nfo"]
    # Upper bound compared against the entity's "fileSize" values (4 * 1024 * 1024).
    MAX_SIZE = 4 * 1024 * 1024
    SCORE = 1

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        Sets the schema to ``PlainText`` and stores the decoded file
        content as ``bodyText``.

        Raises:
            ProcessingException: if any ``fileSize`` value of the entity
                exceeds ``MAX_SIZE``.
        """
        entity.schema = model.get("PlainText")
        for file_size in entity.get("fileSize"):
            if int(file_size) > self.MAX_SIZE:
                raise ProcessingException("Text file is too large.")

        text = self.read_file_decoded(entity, file_path)
        entity.set("bodyText", text)

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/plain.py
def ingest(self, file_path, entity):
    """Store the decoded file content as ``bodyText`` of a ``PlainText`` entity.

    Raises a ProcessingException when any ``fileSize`` value of the entity
    is larger than ``MAX_SIZE``.
    """
    entity.schema = model.get("PlainText")
    oversized = any(int(size) > self.MAX_SIZE for size in entity.get("fileSize"))
    if oversized:
        raise ProcessingException("Text file is too large.")

    entity.set("bodyText", self.read_file_decoded(entity, file_path))

OutlookPSTIngestor

ingestors.email.outlookpst.OutlookPSTIngestor

File types

  • application/vnd.ms-outlook

File extensions

  • .pst

  • .ost

  • .pab

Bases: Ingestor, TempFileSupport, OLESupport, ShellSupport

Source code in ingestors/email/outlookpst.py
class OutlookPSTIngestor(Ingestor, TempFileSupport, OLESupport, ShellSupport):
    """Unpack Outlook stores with ``readpst`` and ingest the resulting directory."""

    MIME_TYPES = ["application/vnd.ms-outlook"]
    EXTENSIONS = ["pst", "ost", "pab"]
    # NOTE(review): sibling ingestors define SCORE, this one BASE_SCORE -
    # confirm which attribute the matcher actually reads.
    BASE_SCORE = 5
    COMMAND_TIMEOUT = 12 * 60 * 60

    def ingest(self, file_path, entity):
        """Extract the store into a fresh directory and delegate its contents.

        If anything inside the ``try`` fails, the failure is logged, whatever
        was already extracted is still delegated, and the exception is
        re-raised.
        """
        entity.schema = model.get("Package")
        self.extract_ole_metadata(file_path, entity)
        out_dir = self.make_empty_directory()
        readpst_args = (
            "readpst",
            "-e",  # make subfolders, files per message
            "-D",  # include deleted
            "-8",  # utf-8 where possible
            "-cv",  # export vcards
            "-o",
            out_dir,
            file_path,
        )
        try:
            self.exec_command(*readpst_args)
            self.manager.delegate(DirectoryIngestor, out_dir, entity)
        except Exception:
            log.exception("Failed to unpack PST.")
            # Handle partially extracted archives.
            self.manager.delegate(DirectoryIngestor, out_dir, entity)
            raise

RARIngestor

ingestors.packages.rar.RARIngestor

File types

  • application/rar

  • application/x-rar

File extensions

  • .rar

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/rar.py
class RARIngestor(PackageSupport, Ingestor):
    """Unpack RAR archives member by member into a temporary directory."""

    # A missing comma used to fuse these two literals into the single
    # string "application/rarapplication/x-rar".
    MIME_TYPES = ["application/rar", "application/x-rar"]
    EXTENSIONS = ["rar"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract every member of the archive into ``temp_dir``.

        A failure on an individual member is logged and skipped.

        Raises:
            ProcessingException: for partial volumes, password-protected
                archives, or otherwise invalid RAR files.
        """
        # FIXME: need to figure out how to unpack multi-part files.
        try:
            with rarfile.RarFile(file_path.as_posix()) as rf:
                names = rf.namelist()
                encoding = self.detect_list_encoding(names)
                log.debug("Detected filename encoding: %s", encoding)

                for name in names:
                    try:
                        # Close each member handle as soon as it is extracted.
                        with rf.open(name) as fh:
                            self.extract_member(temp_dir, name, fh, encoding=encoding)
                    except Exception as exc:
                        # TODO: should this be a fatal error?
                        log.warning("Failed to unpack [%s]: %s", name, exc)
        except rarfile.NeedFirstVolume as nfv:
            raise ProcessingException("Cannot load RAR partials") from nfv
        except rarfile.PasswordRequired as pr:
            raise ProcessingException(str(pr)) from pr
        except (rarfile.Error, TypeError) as err:
            raise ProcessingException("Invalid RAR file: %s" % err) from err

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` for anything rarfile recognises, else the inherited score."""
        # doesn't accept pathlib.Path object
        if rarfile.is_rarfile(file_path.as_posix()):
            return cls.SCORE
        return super(RARIngestor, cls).match(file_path, entity)

SQLiteIngestor

ingestors.tabular.sqlite.SQLiteIngestor

File types

  • application/x-sqlite3

  • application/x-sqlite

  • application/sqlite3

  • application/sqlite

File extensions

  • .sqlite3

  • .sqlite

  • .db

Bases: Ingestor, TableSupport

Source code in ingestors/tabular/sqlite.py
class SQLiteIngestor(Ingestor, TableSupport):
    """Ingest every table of a SQLite database as a ``Table`` in a ``Workbook``."""

    # Applied with re.match, so it only constrains the START of a table name.
    VALID_TABLE = re.compile(r"[\w\d\_\-]{2,4096}")
    MIME_TYPES = [
        "application/x-sqlite3",
        "application/x-sqlite",
        "application/sqlite3",
        "application/sqlite",
    ]
    EXTENSIONS = ["sqlite3", "sqlite", "db"]
    SCORE = 8

    def get_tables(self, conn):
        """Yield the names of all tables in ``sqlite_master`` accepted by ``VALID_TABLE``."""
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table';")
        for (name,) in c.fetchall():
            if self.VALID_TABLE.match(name):
                yield name

    def generate_rows(self, conn, table):
        """Yield each row of ``table`` as an ``OrderedDict`` keyed by column name.

        Raises:
            ProcessingException: if the table cannot be queried.
        """
        cur = conn.cursor()
        # Identifiers cannot be bound as SQL parameters, and the name comes
        # from the ingested file, so quote it as an identifier (doubling any
        # embedded double quotes) instead of interpolating it raw.
        quoted = '"%s"' % table.replace('"', '""')
        try:
            cur.execute("SELECT * FROM %s;" % quoted)
        except sqlite3.OperationalError as oe:
            log.warning("SQLite error: %s", oe)
            raise ProcessingException("Cannot query table: %s" % table) from oe

        headers = [i[0] for i in cur.description]
        while True:
            try:
                row = cur.fetchone()
                if row is None:
                    return
                yield OrderedDict(zip(headers, row))
            except sqlite3.OperationalError as oe:
                # NOTE(review): the loop retries after a fetch error; this
                # relies on a later fetchone() eventually returning None.
                log.warning("SQLite error: %s", oe)

    def ingest(self, file_path, entity):
        """Emit one ``Table`` entity, with its rows, per table in the database."""
        entity.schema = model.get("Workbook")
        conn = sqlite3.connect(file_path)
        try:
            for table_name in self.get_tables(conn):
                table = self.manager.make_entity("Table", parent=entity)
                table.make_id(entity, table_name)
                table.set("title", table_name)
                # Emit a partial table fragment with parent reference and name
                # early, so that we don't have orphan fragments in case of an error
                # in the middle of processing.
                # See https://github.com/alephdata/ingest-file/issues/171
                self.manager.emit_entity(table, fragment="initial")
                rows = self.generate_rows(conn, table_name)
                self.emit_row_dicts(table, rows)
                self.manager.emit_entity(table)
        finally:
            conn.close()

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited score only if the file opens as a SQLite database.

        Returns ``-1`` when the inherited score is not positive or the
        probe query fails.
        """
        score = super(SQLiteIngestor, cls).match(file_path, entity)
        if score > 0:
            conn = None
            try:
                conn = sqlite3.connect(file_path)
                conn.execute("SELECT * FROM sqlite_master;").fetchall()
                return score
            except Exception:
                # Any failure here just means "not a usable SQLite file".
                pass
            finally:
                # The probe connection used to be left open.
                if conn is not None:
                    conn.close()
        return -1

SVGIngestor

ingestors.media.svg.SVGIngestor

File types

  • image/svg+xml

File extensions

  • .svg

Bases: Ingestor, EncodingSupport, HTMLSupport

Source code in ingestors/media/svg.py
class SVGIngestor(Ingestor, EncodingSupport, HTMLSupport):
    """Treat SVG files as ``HyperText`` and extract their content like HTML."""

    MIME_TYPES = ["image/svg+xml"]
    EXTENSIONS = ["svg"]
    SCORE = 20

    def ingest(self, file_path, entity):
        """Decode the file, run the HTML extraction and add the result as ``bodyText``."""
        entity.schema = model.get("HyperText")
        markup = self.read_file_decoded(entity, file_path)
        entity.add("bodyText", self.extract_html_content(entity, markup))

TarIngestor

ingestors.packages.tar.TarIngestor

File types

  • application/tar

  • application/x-tar

  • application/x-tgz

  • application/x-gtar

File extensions

  • .tar

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/tar.py
class TarIngestor(PackageSupport, Ingestor):
    """Unpack tar archives (any compression ``tarfile`` can open transparently)."""

    MIME_TYPES = [
        "application/tar",
        "application/x-tar",
        "application/x-tgz",
        "application/x-gtar",
    ]
    EXTENSIONS = ["tar"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract each archive member into ``temp_dir``.

        Failures on single members are logged and skipped; a broken archive
        is reported as a ProcessingException.
        """
        try:
            with tarfile.open(name=file_path, mode="r:*") as archive:
                member_names = archive.getnames()
                encoding = self.detect_list_encoding(
                    member_names, default=archive.encoding
                )
                log.debug("Detected filename encoding: %s", encoding)

                for member in member_names:
                    try:
                        stream = archive.extractfile(member)
                        self.extract_member(
                            temp_dir, member, stream, encoding=encoding
                        )
                    except Exception:
                        # TODO: should this be a fatal error?
                        log.exception("Failed to unpack: %r", member)
        except (tarfile.TarError, IOError, EOFError) as broken:
            raise ProcessingException("Invalid Tar file: %s" % broken) from broken

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` when ``tarfile`` recognises the file, else the inherited score."""
        if not tarfile.is_tarfile(file_path):
            return super(TarIngestor, cls).match(file_path, entity)
        return cls.SCORE

TIFFIngestor

ingestors.media.tiff.TIFFIngestor

TIFF appears to not really be an image format. Who knew?

File types

  • image/tiff

  • image/x-tiff

File extensions

  • .tif

  • .tiff

Bases: Ingestor, PDFSupport, TempFileSupport, ShellSupport

TIFF appears to not really be an image format. Who knew?

Source code in ingestors/media/tiff.py
class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
    """TIFF appears to not really be an image format. Who knew?"""

    MIME_TYPES = [
        "image/tiff",
        "image/x-tiff",
    ]
    EXTENSIONS = ["tif", "tiff"]
    SCORE = 11

    def ingest(self, file_path, entity):
        """Convert the TIFF to PDF with ``tiff2pdf`` and extract it as ``Pages``.

        The first attempt passes ``-n -j``; if that raises a
        ProcessingException the conversion is retried without those flags.
        """
        entity.schema = model.get("Pages")
        pdf_path = self.make_work_file("tiff.pdf")
        # Options shared by both conversion attempts.
        common = ("-x", "300", "-y", "300", "-o", pdf_path)
        try:
            self.exec_command("tiff2pdf", file_path, "-n", "-j", *common)
        except ProcessingException:
            self.exec_command("tiff2pdf", file_path, *common)

        self.assert_outfile(pdf_path)

        self.pdf_alternative_extract(entity, pdf_path, self.manager)

VCardIngestor

ingestors.email.vcard.VCardIngestor

File types

  • text/vcard

  • text/x-vcard

File extensions

  • .vcf

  • .vcard

Bases: Ingestor, EncodingSupport

Source code in ingestors/email/vcard.py
class VCardIngestor(Ingestor, EncodingSupport):
    """Ingest vCard files as plain text and emit a ``Person`` per card with an email."""

    MIME_TYPES = ["text/vcard", "text/x-vcard"]
    EXTENSIONS = ["vcf", "vcard"]
    SCORE = 10

    # (Person property, vCard field) pairs, applied in this order.
    _FIELD_MAP = (
        ("name", "n"),
        ("name", "fn"),
        ("gender", "gender"),
        ("birthDate", "bday"),
        ("position", "title"),
        ("summary", "note"),
        ("keywords", "categories"),
        ("phone", "tel"),
        ("weakAlias", "nickname"),
    )

    def get_field(self, card: Component, field: str):
        """Return the values stored under ``field`` in the card, as strings."""
        lines = ensure_list(card.contents.get(field))
        return [str(line.value) for line in lines]

    def ingest_card(self, entity, card):
        """Build a ``Person`` from one card; emit it only if it received an id.

        The id is derived from the card's non-empty, lower-cased email
        addresses, so cards without one are not emitted.
        """
        person = self.manager.make_entity("Person")
        person.add("proof", entity.id)
        for prop, field in self._FIELD_MAP:
            person.add(prop, self.get_field(card, field))
        for address in self.get_field(card, "email"):
            normalized = address.strip().lower()
            if normalized:
                person.make_id(normalized)
            person.add("email", address)
        if not person.id:
            return
        self.manager.apply_context(person, entity)
        self.manager.emit_entity(person)

    def ingest(self, file_path, entity):
        """Store the sanitized file text as ``bodyText`` and ingest each card in it."""
        entity.schema = model.get("PlainText")
        raw = self.read_file_decoded(entity, file_path)
        text = sanitize_text(raw)
        entity.set("bodyText", text)
        try:
            for component in vobject.readComponents(text, allowQP=True):
                self.ingest_card(entity, component)
        except (ParseError, UnicodeDecodeError) as err:
            raise ProcessingException("Cannot parse vcard: %s" % err) from err

VideoIngestor

ingestors.media.video.VideoIngestor

File types

  • application/x-shockwave-flash

  • video/quicktime

  • video/mp4

  • video/x-flv

File extensions

  • .avi

  • .mpg

  • .mpeg

  • .mkv

  • .mp4

  • .mov

Bases: Ingestor, TimestampSupport, TranscriptionSupport

Source code in ingestors/media/video.py
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
    """Read video metadata with MediaInfo and queue the file for transcription."""

    MIME_TYPES = [
        "application/x-shockwave-flash",
        "video/quicktime",
        "video/mp4",
        "video/x-flv",
    ]
    EXTENSIONS = [
        "avi",
        "mpg",
        "mpeg",
        "mkv",
        "mp4",
        "mov",
    ]
    SCORE = 3

    def ingest(self, file_path, entity):
        """Copy track metadata onto a ``Video`` entity, then request transcription.

        Raises:
            ProcessingException: if the metadata cannot be read. A failure
                to queue the transcription is only logged.
        """
        try:
            entity.schema = model.get("Video")
            log.info("[%r] flagged as video.", entity)
            metadata = MediaInfo.parse(file_path)
            for track in metadata.tracks:
                entity.add("title", track.title)
                entity.add("generator", track.writing_application)
                entity.add("generator", track.writing_library)
                entity.add("generator", track.publisher)
                entity.add("authoredAt", self.parse_timestamp(track.recorded_date))
                entity.add("authoredAt", self.parse_timestamp(track.tagged_date))
                entity.add("authoredAt", self.parse_timestamp(track.encoded_date))
                modified_at = self.parse_timestamp(track.file_last_modification_date)
                entity.add("modifiedAt", modified_at)
                entity.add("duration", track.duration)
        except Exception as ex:
            # Format the message here: exceptions do not interpolate extra
            # positional arguments the way logging calls do.
            raise ProcessingException("Could not read video: %r" % (ex,)) from ex
        try:
            self.transcribe(self.manager.dataset, entity, self.manager.context)
        except Exception as ex:
            log.error("Could not queue audio for transcription: %s", ex)

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited score, or twice ``SCORE`` for any ``video/*`` MIME type."""
        score = super(VideoIngestor, cls).match(file_path, entity)
        if score <= 0:
            for mime_type in entity.get("mimeType"):
                if mime_type.startswith("video/"):
                    return cls.SCORE * 2
        return score

ExcelIngestor

ingestors.tabular.xls.ExcelIngestor

File types

  • application/excel

  • application/x-excel

  • application/vnd.ms-excel

  • application/x-msexcel

File extensions

  • .xls

  • .xlt

  • .xla

Bases: Ingestor, TableSupport, OLESupport

Source code in ingestors/tabular/xls.py
class ExcelIngestor(Ingestor, TableSupport, OLESupport):
    """Ingest legacy Excel workbooks via xlrd, one ``Table`` per sheet."""

    MIME_TYPES = [
        "application/excel",
        "application/x-excel",
        "application/vnd.ms-excel",
        "application/x-msexcel",
    ]
    EXTENSIONS = ["xls", "xlt", "xla"]
    SCORE = 7

    def convert_cell(self, cell, sheet):
        """Return the cell value, converting cells of ctype 3 to date/time values.

        For ctype 3 cells: a value of 0 gives ``None``; a value without a
        date part gives an ISO time string; anything else gives a
        ``datetime``. If the conversion fails the raw value is returned.
        """
        value = cell.value
        try:
            if cell.ctype == 3:
                if value == 0:
                    return None
                year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
                    value, sheet.book.datemode
                )
                if (year, month, day) == (0, 0, 0):
                    value = time(hour, minute, second)
                    return value.isoformat()
                else:
                    return datetime(year, month, day, hour, minute, second)
        except Exception as exc:
            log.warning("Error in Excel value [%s]: %s", cell, exc)
        return value

    def generate_csv(self, sheet):
        """Yield each row of the sheet as a list of converted cell values."""
        for row_index in range(sheet.nrows):
            yield [self.convert_cell(c, sheet) for c in sheet.row(row_index)]

    def ingest(self, file_path, entity):
        """Emit one ``Table`` per sheet of the workbook.

        Raises:
            ProcessingException: with ``ENCRYPTED_MSG`` when opening raises
                ``XLRDError``, or with an "Invalid Excel file" message for
                other failures.
        """
        entity.schema = model.get("Workbook")
        self.extract_ole_metadata(file_path, entity)
        try:
            book = xlrd.open_workbook(file_path, formatting_info=False)
        except XLRDError as err:
            # Chain the cause explicitly, like every other handler here.
            raise ProcessingException(ENCRYPTED_MSG) from err
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for sheet in book.sheets():
                table = self.manager.make_entity("Table", parent=entity)
                table.make_id(entity.id, sheet.name)
                table.set("title", sheet.name)
                # Emit a partial table fragment with parent reference and name
                # early, so that we don't have orphan fragments in case of an error
                # in the middle of processing.
                # See https://github.com/alephdata/ingest-file/issues/171
                self.manager.emit_entity(table, fragment="initial")
                self.emit_row_tuples(table, self.generate_csv(sheet))
                if table.has("csvHash"):
                    self.manager.emit_entity(table)
        except XLRDError as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err
        finally:
            book.release_resources()

ExcelXMLIngestor

ingestors.tabular.xlsx.ExcelXMLIngestor

File types

  • application/vnd.openxmlformats-officedocument.spreadsheetml.sheet

  • application/vnd.openxmlformats-officedocument.spreadsheetml.template

  • application/vnd.ms-excel.sheet.macroenabled.12

  • application/vnd.ms-excel.sheet.binary.macroenabled.12

  • application/vnd.ms-excel.template.macroenabled.12

  • application/vnd.ms-excel.sheet.macroEnabled.main+xml

File extensions

  • .xlsx

  • .xlsm

  • .xltx

  • .xltm

Bases: Ingestor, TableSupport, OOXMLSupport

Source code in ingestors/tabular/xlsx.py
class ExcelXMLIngestor(Ingestor, TableSupport, OOXMLSupport):
    """Ingest OOXML spreadsheets, emitting one ``Table`` per worksheet."""

    MIME_TYPES = [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa
        "application/vnd.openxmlformats-officedocument.spreadsheetml.template",  # noqa
        "application/vnd.ms-excel.sheet.macroenabled.12",
        "application/vnd.ms-excel.sheet.binary.macroenabled.12",
        "application/vnd.ms-excel.template.macroenabled.12",
        "application/vnd.ms-excel.sheet.macroEnabled.main+xml",
    ]
    EXTENSIONS = ["xlsx", "xlsm", "xltx", "xltm"]
    SCORE = 7

    def generate_rows(self, sheet):
        """Yield the cell values of each row; unreadable rows are logged and skipped."""
        for cells in sheet.rows:
            try:
                yield [cell.value for cell in cells]
            except (ValueError, OverflowError, ParseError) as problem:
                log.warning("Failed to read Excel row: %s", problem)

    def ingest(self, file_path, entity):
        """Open the workbook read-only and emit a ``Table`` for every sheet with rows.

        Sheets without a ``rows`` attribute are logged and skipped. Any
        failure is reported as a ProcessingException; the workbook is always
        closed afterwards.
        """
        entity.schema = model.get("Workbook")
        self.ooxml_extract_metadata(file_path, entity)
        try:
            workbook = load_workbook(file_path, read_only=True)
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for title in workbook.sheetnames:
                worksheet = workbook[title]
                if not hasattr(worksheet, "rows"):
                    log.warning("Cannot parse chart sheet: %s", title)
                    continue
                table = self.manager.make_entity("Table", parent=entity)
                table.make_id(entity.id, title)
                table.set("title", title)
                # Emit a partial table fragment with parent reference and name
                # early, so that we don't have orphan fragments in case of an error
                # in the middle of processing.
                # See https://github.com/alephdata/ingest-file/issues/171
                self.manager.emit_entity(table, fragment="initial")
                log.debug("Sheet: %s", title)
                self.emit_row_tuples(table, self.generate_rows(worksheet))
                if table.has("csvHash"):
                    self.manager.emit_entity(table)
        except Exception as err:
            raise ProcessingException("Cannot read Excel file: %s" % err) from err
        finally:
            workbook.close()

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited score, or ``-1`` if a positive match fails the OOXML manifest check."""
        score = super(ExcelXMLIngestor, cls).match(file_path, entity)
        if score <= 0:
            return score
        if cls.inspect_ooxml_manifest(file_path):
            return score
        return -1

XMLIngestor

ingestors.documents.xml.XMLIngestor

XML file ingestor class. Generates a tabular HTML representation.

File types

  • text/xml

File extensions

  • .xml

Bases: Ingestor, EncodingSupport, XMLSupport, HTMLSupport

XML file ingestor class. Generates a tabular HTML representation.

Source code in ingestors/documents/xml.py
class XMLIngestor(Ingestor, EncodingSupport, XMLSupport, HTMLSupport):
    """XML file ingestor class. Generates a tabular HTML representation."""

    MIME_TYPES = ["text/xml"]
    EXTENSIONS = ["xml"]
    SCORE = 1
    MAX_SIZE = 4 * 1024 * 1024
    # Stylesheet that renders each element as a table row of name and value,
    # nesting a table for elements that have child elements.
    XSLT = etree.XML(
        b"""<?xml version="1.0" encoding="UTF-8"?>
        <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
            version="1.0">
        <xsl:output omit-xml-declaration="yes" indent="yes"/>
        <xsl:strip-space elements="*"/>

        <xsl:template match="/">
            <table>
            <xsl:apply-templates/>
            </table>
        </xsl:template>

        <xsl:template match="*">
            <tr>
            <td>
                <p><xsl:value-of select="name()"/></p>
            </td>
            <td>
                <p><xsl:value-of select="."/></p>
            </td>
            </tr>
        </xsl:template>

        <xsl:template match="*[*]">
            <tr>
            <td>
                <p><xsl:value-of select="name()"/></p>
            </td>
            <td>
                <table>
                <xsl:apply-templates/>
                </table>
            </td>
            </tr>
        </xsl:template>

        </xsl:stylesheet>"""
    )

    def ingest(self, file_path, entity):
        """Store the document text as ``bodyText`` and its table rendering as ``bodyHtml``.

        Raises a ProcessingException when a ``fileSize`` value exceeds
        ``MAX_SIZE`` or the conversion raises ``ValueError``.
        """
        entity.schema = model.get("HyperText")
        for raw_size in entity.get("fileSize"):
            if int(raw_size) > self.MAX_SIZE:
                raise ProcessingException("XML file is too large.")

        tree = self.parse_xml_path(file_path)
        entity.set("bodyText", self.extract_html_text(tree.getroot()))
        try:
            stylesheet = etree.XSLT(self.XSLT)
            rendered = html.tostring(
                stylesheet(tree), encoding=str, pretty_print=True
            )
            entity.set("bodyHtml", rendered)
        except ValueError as ve:
            raise ProcessingException("Error converting XML file: %s" % ve) from ve

ingest(file_path, entity)

Ingestor implementation.

Source code in ingestors/documents/xml.py
def ingest(self, file_path, entity):
    """Store the document text as ``bodyText`` and its table rendering as ``bodyHtml``.

    Raises a ProcessingException when a ``fileSize`` value exceeds
    ``MAX_SIZE`` or the conversion raises ``ValueError``.
    """
    entity.schema = model.get("HyperText")
    for raw_size in entity.get("fileSize"):
        if int(raw_size) > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

    tree = self.parse_xml_path(file_path)
    entity.set("bodyText", self.extract_html_text(tree.getroot()))
    try:
        stylesheet = etree.XSLT(self.XSLT)
        rendered = html.tostring(stylesheet(tree), encoding=str, pretty_print=True)
        entity.set("bodyHtml", rendered)
    except ValueError as ve:
        raise ProcessingException("Error converting XML file: %s" % ve) from ve

ZipIngestor

ingestors.packages.zip.ZipIngestor

File types

  • application/zip

  • application/x-zip

  • multipart/x-zip

  • application/zip-compressed

  • application/x-zip-compressed

File extensions

  • .zip

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/zip.py
class ZipIngestor(PackageSupport, Ingestor):
    """Unpack ZIP archives file by file into a temporary directory."""

    MIME_TYPES = [
        "application/zip",
        "application/x-zip",
        "multipart/x-zip",
        "application/zip-compressed",
        "application/x-zip-compressed",
    ]
    EXTENSIONS = ["zip"]
    SCORE = 3

    def unpack(self, file_path, entity, temp_dir):
        """Extract every non-directory member of the archive into ``temp_dir``.

        Failures on single members are logged at debug level and skipped; a
        broken archive is reported as a ProcessingException.
        """
        try:
            with zipfile.ZipFile(file_path) as archive:
                member_names = archive.namelist()
                encoding = self.detect_list_encoding(member_names)
                log.debug("Detected filename encoding: %s", encoding)
                for member in member_names:
                    try:
                        if archive.getinfo(member).is_dir():
                            continue

                        with archive.open(member) as stream:
                            self.extract_member(
                                temp_dir, member, stream, encoding=encoding
                            )
                    except Exception as ex:
                        # TODO: should this be a fatal error?
                        log.debug("Failed to unpack [%r]: %s", member, ex)
        except (zipfile.BadZipfile, UnicodeDecodeError, OSError) as bzfe:
            raise ProcessingException("Invalid ZIP file: %s" % bzfe) from bzfe

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` when ``zipfile`` recognises the file, else the inherited score."""
        if not zipfile.is_zipfile(file_path):
            return super(ZipIngestor, cls).match(file_path, entity)
        return cls.SCORE