Ingestors

SevenZipIngestor

ingestors.packages.SevenZipIngestor

File types

application/x-7z-compressed
application/7z-compressed

File extensions

.7z
.7zip

Bases: PackageSupport, Ingestor, ShellSupport

Source code in ingestors/packages/__init__.py

class SevenZipIngestor(PackageSupport, Ingestor, ShellSupport):
    """Unpack 7-Zip archives using ``py7zr``."""

    MIME_TYPES = ["application/x-7z-compressed", "application/7z-compressed"]
    EXTENSIONS = ["7z", "7zip"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract the archive at ``file_path`` into ``temp_dir``.

        Raises:
            ProcessingException: if py7zr reports an ``ArchiveError``.
        """
        # check if the file_path belongs to a 7z fragmented archive and
        # reconstruct the filename ("_7z" in the name stands in for ".7z")
        archive_path = PurePath(file_path)
        if "_7z" in archive_path.name:
            # with_name() swaps only the last component, so the directory part
            # is preserved for absolute and relative paths alike (the previous
            # implementation forced a "/" root and dropped the first part).
            archive_path = archive_path.with_name(
                archive_path.name.replace("_7z", ".7z")
            )

        try:
            with py7zr.SevenZipFile(str(archive_path), mode="r") as extractor:
                extractor.extractall(path=temp_dir)
        except ArchiveError as e:
            raise ProcessingException(f"Error: {e}") from e

AccessIngestor

ingestors.tabular.access.AccessIngestor

File types

application/msaccess
application/x-msaccess
application/vnd.msaccess
application/vnd.ms-access
application/mdb
application/x-mdb

File extensions

.mdb

Bases: Ingestor, TableSupport, ShellSupport

Source code in ingestors/tabular/access.py

class AccessIngestor(Ingestor, TableSupport, ShellSupport):
    """Ingest Microsoft Access databases through the ``mdb-tools`` binaries.

    The database entity becomes a ``Workbook``; each table reported by
    ``mdb-tables`` is emitted as a child ``Table`` entity.
    """

    MIME_TYPES = [
        "application/msaccess",
        "application/x-msaccess",
        "application/vnd.msaccess",
        "application/vnd.ms-access",
        "application/mdb",
        "application/x-mdb",
    ]
    EXTENSIONS = ["mdb"]
    SCORE = 8

    def get_tables(self, local_path):
        """Return the table names ``mdb-tables`` lists for ``local_path``.

        Raises:
            RuntimeError: if the ``mdb-tables`` command cannot be found.
            ProcessingException: if ``mdb-tables`` exits with an error.
        """
        mdb_tables = self.find_command("mdb-tables")
        if mdb_tables is None:
            raise RuntimeError("mdb-tools is not available")
        try:
            output = subprocess.check_output([mdb_tables, local_path])
        except subprocess.CalledProcessError as cpe:
            log.warning("Failed to open MDB: %s", cpe)
            raise ProcessingException("Failed to extract Access DB.") from cpe
        # The output is split on single spaces; blank tokens are dropped.
        return [t.strip().decode("utf-8") for t in output.split(b" ") if t.strip()]

    def generate_rows(self, file_path, table_name):
        """Yield each row of ``table_name`` as an ``OrderedDict``.

        The first CSV line printed by ``mdb-export`` is used as the header.

        Raises:
            RuntimeError: if the ``mdb-export`` command cannot be found.
        """
        mdb_export = self.find_command("mdb-export")
        if mdb_export is None:
            raise RuntimeError("mdb-tools is not available")
        args = [mdb_export, "-b", "strip", file_path, table_name]
        # The context manager closes the stdout pipe and waits for the child
        # once the rows are exhausted (or the generator is closed early), so
        # no file descriptor or zombie process is left behind.
        with subprocess.Popen(args, stdout=subprocess.PIPE) as proc:
            output = io.TextIOWrapper(proc.stdout, newline=os.linesep)
            headers = None
            for row in csv.reader(output, delimiter=","):
                if headers is None:
                    headers = row
                    continue
                yield OrderedDict(zip(headers, row))

    def ingest(self, file_path, entity):
        """Emit one ``Table`` entity per table found in the database."""
        entity.schema = model.get("Workbook")
        for table_name in self.get_tables(file_path):
            table = self.manager.make_entity("Table", parent=entity)
            table.make_id(entity.id, table_name)
            table.set("title", table_name)
            # Emit a partial table fragment with parent reference and name
            # early, so that we don't have orphan fragments in case of an error
            # in the middle of processing.
            # See https://github.com/alephdata/ingest-file/issues/171
            self.manager.emit_entity(table, fragment="initial")
            rows = self.generate_rows(file_path, table_name)
            self.emit_row_dicts(table, rows)
            self.manager.emit_entity(table)

AudioIngestor

ingestors.media.audio.AudioIngestor

File types

audio/mpeg
audio/mp3
audio/x-m4a
audio/x-hx-aac-adts
audio/x-wav
audio/mp4
audio/ogg
audio/vnd.wav
audio/flac
audio/x-ms-wma
audio/webm

File extensions

.wav
.mp3
.aac
.ac3
.m4a
.m4b
.ogg
.opus
.flac
.wma

Bases: Ingestor, TimestampSupport, TranscriptionSupport

Source code in ingestors/media/audio.py

class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
    """Read audio metadata with MediaInfo and request a transcription."""

    MIME_TYPES = [
        "audio/mpeg",
        "audio/mp3",
        "audio/x-m4a",
        "audio/x-hx-aac-adts",
        "audio/x-wav",
        "audio/mp4",
        "audio/ogg",
        "audio/vnd.wav",
        "audio/flac",
        "audio/x-ms-wma",
        "audio/webm",
    ]
    EXTENSIONS = [
        "wav",
        "mp3",
        "aac",
        "ac3",
        "m4a",
        "m4b",
        "ogg",
        "opus",
        "flac",
        "wma",
    ]
    SCORE = 3

    def ingest(self, file_path, entity):
        """Copy per-track metadata onto ``entity``, then call ``transcribe``.

        Raises:
            ProcessingException: if reading the metadata fails for any reason.
        """
        generator_attrs = ("writing_application", "writing_library", "publisher")
        authored_attrs = ("recorded_date", "tagged_date", "encoded_date")
        try:
            entity.schema = model.get("Audio")
            for track in MediaInfo.parse(file_path).tracks:
                entity.add("title", track.title)
                for attr in generator_attrs:
                    entity.add("generator", getattr(track, attr))
                for attr in authored_attrs:
                    entity.add("authoredAt", self.parse_timestamp(getattr(track, attr)))
                entity.add(
                    "modifiedAt",
                    self.parse_timestamp(track.file_last_modification_date),
                )
                if track.sampling_rate:
                    entity.add("samplingRate", track.sampling_rate)
                entity.add("duration", track.duration)
        except Exception as ex:
            raise ProcessingException(f"Could not read audio: {ex}") from ex

        # A transcription failure is only logged; it does not fail the ingest.
        try:
            self.transcribe(self.manager.dataset, entity, self.manager.context)
        except Exception as ex:
            log.error(f"Could not queue audio for transcription: {ex}")

    @classmethod
    def match(cls, file_path, entity):
        """Score the file; any ``audio/*`` MIME type is accepted as a fallback."""
        score = super().match(file_path, entity)
        if score > 0:
            return score
        if any(mime.startswith("audio/") for mime in entity.get("mimeType")):
            return cls.SCORE * 2
        return score

BZ2Ingestor

ingestors.packages.BZ2Ingestor

File types

application/x-bzip
application/x-bzip2
multipart/x-bzip
multipart/x-bzip2

File extensions

.bz
.tbz
.bz2
.tbz2

Bases: SingleFilePackageIngestor

Source code in ingestors/packages/__init__.py

class BZ2Ingestor(SingleFilePackageIngestor):
    """Decompress a single bzip2-compressed file."""

    MIME_TYPES = [
        "application/x-bzip",
        "application/x-bzip2",
        "multipart/x-bzip",
        "multipart/x-bzip2",
    ]
    EXTENSIONS = ["bz", "tbz", "bz2", "tbz2"]

    def unpack_file(self, file_path, temp_file):
        """Decompress ``file_path`` and write the result to ``temp_file``.

        Raises:
            ProcessingException: if the archive cannot be read or is truncated.
        """
        try:
            with bz2.BZ2File(file_path) as src, open(temp_file, "wb") as dst:
                shutil.copyfileobj(src, dst)
        except (IOError, EOFError) as err:
            # A stream that ends before its end-of-stream marker raises
            # EOFError, which is not an IOError subclass.
            raise ProcessingException("Error: %s" % err) from err

CalendarIngestor

ingestors.email.calendar.CalendarIngestor

File types

text/calendar

File extensions

.ics
.ical
.icalendar
.ifb

Bases: Ingestor, EncodingSupport

Source code in ingestors/email/calendar.py

class CalendarIngestor(Ingestor, EncodingSupport):
    """Ingest iCalendar files and emit an ``Event`` entity per ``VEVENT``."""

    MIME_TYPES = ["text/calendar"]
    EXTENSIONS = ["ics", "ical", "icalendar", "ifb"]
    SCORE = 10

    def address_entity(self, address):
        """Return the entity of the ``EmailIdentity`` built from ``address``.

        A leading ``mailto:`` scheme (any case) is removed first. Returns
        ``None`` when no address is given.
        """
        if address is None:
            # Avoid building an identity from the literal string "None".
            return None
        email = str(address).strip()
        if email.lower().startswith("mailto:"):
            # Slice the stripped string, not the raw ``address`` object.
            email = email[len("mailto:") :]
        identity = EmailIdentity(self.manager, None, email)
        return identity.entity

    def ingest_component(self, entity, idx, comp):
        """Process one calendar component.

        ``VCALENDAR`` contributes its ``PRODID`` as the document generator;
        ``VEVENT`` is emitted as an ``Event`` entity. ``idx`` is the position
        of the component in the calendar walk; it is used as the emit fragment
        and, when the event has no UID, as part of the event id.
        """
        if comp.name == "VCALENDAR":
            entity.add("generator", comp.get("PRODID"))
        if comp.name == "VEVENT":
            event = self.manager.make_entity("Event")
            self.manager.apply_context(event, entity)
            uid = sanitize_text(comp.get("UID"))
            if uid is not None:
                event.make_id(uid)
            else:
                event.make_id(entity.id, idx)
            event.add("proof", entity)
            event.add("name", comp.get("SUMMARY"))
            event.add("description", comp.get("DESCRIPTION"))
            event.add("location", comp.get("LOCATION"))
            event.add("sourceUrl", comp.get("URL"))
            event.add("startDate", cal_date(comp.get("DTSTART")))
            event.add("endDate", cal_date(comp.get("DTEND")))
            event.add("date", cal_date(comp.get("CREATED")))
            event.add("modifiedAt", cal_date(comp.get("LAST-MODIFIED")))
            event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
            for attendee in ensure_list(comp.get("ATTENDEE")):
                event.add("involved", self.address_entity(attendee))
            self.manager.emit_entity(event, fragment=idx)

    def ingest(self, file_path, entity):
        """Store the decoded file text and ingest every calendar component.

        Raises:
            ProcessingException: if parsing or component handling fails.
        """
        entity.schema = model.get("PlainText")
        entity.add("encoding", "utf-8")
        text = self.read_file_decoded(entity, file_path)
        entity.set("bodyText", text)
        try:
            calendar = icalendar.Calendar.from_ical(text)
            for idx, comp in enumerate(calendar.walk()):
                self.ingest_component(entity, idx, comp)
        except Exception as exc:
            raise ProcessingException("Failed to parse iCalendar") from exc

CSVIngestor

ingestors.tabular.csv.CSVIngestor

Decode and ingest a CSV file.

This expects a properly formatted CSV file with a header in the first row.

File types

text/csv
text/tsv
text/tab-separated-values

File extensions

.csv
.tsv

Bases: Ingestor, TableSupport

Decode and ingest a CSV file.

This expects a properly formatted CSV file with a header in the first row.

Source code in ingestors/tabular/csv.py

class CSVIngestor(Ingestor, TableSupport):
    """Ingest delimited text files (CSV/TSV) as a ``Table`` entity.

    The encoding is detected from the raw bytes and the dialect is sniffed
    from the beginning of the decoded text.
    """

    MIME_TYPES = ["text/csv", "text/tsv", "text/tab-separated-values"]
    EXTENSIONS = ["csv", "tsv"]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Detect encoding and dialect, then emit every row of the file.

        Raises:
            ProcessingException: if sniffing or reading the CSV fails.
        """
        entity.schema = model.get("Table")
        with io.open(file_path, "rb") as raw:
            encoding = self.detect_stream_encoding(raw)
            log.debug("Detected encoding [%r]: %s", entity, encoding)

        with io.open(file_path, "r", encoding=encoding, errors="replace") as fh:
            try:
                # Sniff the dialect on a leading sample, then rewind.
                sample = fh.read(4096 * 10)
                fh.seek(0)
                dialect = csv.Sniffer().sniff(sample)
                self.emit_row_tuples(entity, csv.reader(fh, dialect=dialect))
            except Exception as err:
                log.warning("CSV error: %s", err)
                raise ProcessingException("Invalid CSV: %s" % err) from err

DBFIngestor

ingestors.tabular.dbf.DBFIngestor

File types

application/dbase
application/x-dbase
application/dbf
application/x-dbf

File extensions

.dbf

Bases: Ingestor, TableSupport

Source code in ingestors/tabular/dbf.py

class DBFIngestor(Ingestor, TableSupport):
    """Ingest dBase (DBF) files as a ``Table`` entity."""

    MIME_TYPES = [
        "application/dbase",
        "application/x-dbase",
        "application/dbf",
        "application/x-dbf",
    ]
    EXTENSIONS = ["dbf"]
    # NOTE(review): sibling ingestors declare ``SCORE``; confirm that
    # ``BASE_SCORE`` is the attribute the base class actually reads.
    BASE_SCORE = 8

    def generate_rows(self, table):
        """Yield each record of ``table`` as an ``OrderedDict`` keyed by field name.

        Records that fail to convert are logged and skipped.
        """
        headers = [stringify(h) for h in table.field_names]
        for row in table:
            try:
                yield OrderedDict(zip(headers, row))
            except Exception as ex:
                log.warning("Cannot decode DBF row: %s", ex)

    def ingest(self, file_path, entity):
        """Open the DBF file and emit its rows.

        Raises:
            ProcessingException: if the dbf library raises ``DbfError``.
        """
        entity.schema = model.get("Table")
        try:
            table = Table(file_path.as_posix()).open()
            try:
                self.emit_row_dicts(entity, self.generate_rows(table))
            finally:
                # Release the underlying file handle even if emitting fails.
                table.close()
        except DbfError as err:
            raise ProcessingException("Cannot open DBF file: %s" % err) from err

DjVuIngestor

ingestors.documents.djvu.DjVuIngestor

Read DjVu e-books.

File types

image/vnd.djvu
image/x.djvu
image/x-djvu
image/djvu

File extensions

Bases: Ingestor, PDFSupport, TempFileSupport

Read DjVu e-books.

Source code in ingestors/documents/djvu.py

class DjVuIngestor(Ingestor, PDFSupport, TempFileSupport):
    """Ingest DjVu documents by converting them to PDF with ``ddjvu``."""

    MIME_TYPES = [
        "image/vnd.djvu",
        "image/x.djvu",
        "image/x-djvu",
        "image/djvu",
    ]

    def ingest(self, file_path, entity):
        """Convert the document to a work-file PDF and extract from that."""
        entity.schema = model.get("Pages")
        pdf_path = self.make_work_file("page.pdf")
        command = (
            "ddjvu",
            "-format=pdf",
            "-quality=100",
            "-skip",
            file_path,
            pdf_path,
        )
        self.exec_command(*command)
        # Check the converter's output file before handing it on.
        self.assert_outfile(pdf_path)
        self.pdf_alternative_extract(entity, pdf_path, self.manager)

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/djvu.py

def ingest(self, file_path, entity):
    """Convert the DjVu document to a work-file PDF and extract from that."""
    entity.schema = model.get("Pages")
    pdf_path = self.make_work_file("page.pdf")
    command = (
        "ddjvu",
        "-format=pdf",
        "-quality=100",
        "-skip",
        file_path,
        pdf_path,
    )
    self.exec_command(*command)
    # Check the converter's output file before handing it on.
    self.assert_outfile(pdf_path)
    self.pdf_alternative_extract(entity, pdf_path, self.manager)

AppleEmlxIngestor

ingestors.email.emlx.AppleEmlxIngestor

File types

File extensions

.emlx

Bases: RFC822Ingestor

Source code in ingestors/email/emlx.py

class AppleEmlxIngestor(RFC822Ingestor):
    """Ingest ``.emlx`` mail files.

    The first line of the file is read as the byte length of the message
    that follows it.
    """

    MIME_TYPES = []
    EXTENSIONS = ["emlx"]
    SCORE = 8

    def ingest(self, file_path, entity):
        """Read the length-prefixed message and ingest it as an email.

        Raises:
            ProcessingException: if the length line or the message is invalid.
        """
        entity.schema = model.get("Email")
        try:
            with open(file_path, "rb") as fh:
                length_line = fh.readline()
                raw_message = fh.read(int(length_line.strip()))
            msg = email.message_from_bytes(raw_message, policy=default)
        except (MessageError, ValueError, IndexError) as err:
            raise ProcessingException("Cannot parse email: %s" % err) from err

        self.ingest_msg(entity, msg)

GzipIngestor

ingestors.packages.GzipIngestor

File types

application/gzip
application/x-gzip
multipart/x-gzip

File extensions

.gz
.tgz

Bases: SingleFilePackageIngestor

Source code in ingestors/packages/__init__.py

class GzipIngestor(SingleFilePackageIngestor):
    """Decompress a single gzip-compressed file."""

    MIME_TYPES = ["application/gzip", "application/x-gzip", "multipart/x-gzip"]
    EXTENSIONS = ["gz", "tgz"]

    def unpack_file(self, file_path, temp_file):
        """Decompress ``file_path`` and write the result to ``temp_file``.

        Raises:
            ProcessingException: if the archive is unreadable, truncated or
                corrupt.
        """
        # Imported locally so the block does not depend on a file-level import.
        import zlib

        try:
            with gzip.GzipFile(file_path) as src, open(temp_file, "wb") as dst:
                shutil.copyfileobj(src, dst)
        except (IOError, EOFError, zlib.error) as err:
            # Invalid gzip files can raise EOFError or zlib.error, neither of
            # which is an IOError subclass.
            raise ProcessingException("Error: %s" % err) from err

HTMLIngestor

ingestors.documents.html.HTMLIngestor

HTML file ingestor class. Extracts the text from the web page.

File types

text/html

File extensions

.htm
.html
.xhtml

Bases: Ingestor, EncodingSupport, HTMLSupport

HTML file ingestor class. Extracts the text from the web page.

Source code in ingestors/documents/html.py

class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
    """Ingest HTML pages and extract their content."""

    MIME_TYPES = ["text/html"]
    EXTENSIONS = ["htm", "html", "xhtml"]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Decode the file and pass the markup to the HTML extraction helper."""
        entity.schema = model.get("HyperText")
        markup = self.read_file_decoded(entity, file_path)
        self.extract_html_content(entity, markup)

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/html.py

def ingest(self, file_path, entity):
    """Decode the file and pass the markup to the HTML extraction helper."""
    entity.schema = model.get("HyperText")
    markup = self.read_file_decoded(entity, file_path)
    self.extract_html_content(entity, markup)

IgnoreIngestor

ingestors.ignore.IgnoreIngestor

File types

application/x-pkcs7-mime
application/pkcs7-mime
application/pkcs7-signature
application/x-pkcs7-signature
application/x-pkcs12
application/pgp-encrypted
application/x-shockwave-flash
application/vnd.apple.pkpass
application/x-executable
application/x-mach-binary
application/x-sharedlib
application/x-dosexec
application/x-java-keystore
application/java-archive
application/font-sfnt
application/vnd.ms-office.vbaproject
application/x-x509-ca-cert
text/calendar
text/css
application/vnd.ms-opentype
application/x-font-ttf

File extensions

.json
.exe
.dll
.ini
.class
.jar
.psd
.indd
.sql
.dat
.log
.pbl
.p7m
.plist
.ics
.axd

Bases: Ingestor

Source code in ingestors/ignore.py

class IgnoreIngestor(Ingestor):
    """Match files that should be stored but not processed any further."""

    MIME_TYPES = [
        "application/x-pkcs7-mime",
        "application/pkcs7-mime",
        "application/pkcs7-signature",
        "application/x-pkcs7-signature",
        # These two were previously fused into one bogus entry by implicit
        # string concatenation (missing comma).
        "application/x-pkcs12",
        "application/pgp-encrypted",
        "application/x-shockwave-flash",
        "application/vnd.apple.pkpass",
        "application/x-executable",
        "application/x-mach-binary",
        "application/x-sharedlib",
        "application/x-dosexec",
        "application/x-java-keystore",
        "application/java-archive",
        "application/font-sfnt",
        "application/vnd.ms-office.vbaproject",
        "application/x-x509-ca-cert",
        "text/calendar",
        "text/css",
        "application/vnd.ms-opentype",
        "application/x-font-ttf",
    ]
    EXTENSIONS = [
        "json",
        "exe",
        "dll",
        "ini",
        "class",
        "jar",
        "psd",  # adobe photoshop
        "indd",  # adobe indesign
        "sql",
        "dat",
        "log",
        "pbl",
        "p7m",
        "plist",
        "ics",
        "axd",
    ]
    # Exact file names that are always ignored.
    NAMES = [".DS_Store", "Thumbs.db", ".gitignore"]
    SCORE = 2

    def ingest(self, file_path, entity):
        """Log that the entity is being ignored; nothing else is done."""
        log.info("[%r] will be ignored but stored.", entity)

    @classmethod
    def match(cls, file_path, entity):
        """Score the file for this ingestor.

        Zero-byte files get ``SCORE * 100``; files whose name is listed in
        ``NAMES`` get ``SCORE``; everything else defers to the base matcher.
        """
        for file_size in entity.get("fileSize"):
            if int(file_size) == 0:
                return cls.SCORE * 100
        for file_name in entity.get("fileName"):
            if file_name in cls.NAMES:
                return cls.SCORE
        return super(IgnoreIngestor, cls).match(file_path, entity)

ImageIngestor

ingestors.media.image.ImageIngestor

Image file ingestor class. Extracts the text from images using OCR.

File types

image/x-portable-graymap
image/png
image/x-png
image/jpeg
image/jpg
image/gif
image/pjpeg
image/bmp
image/x-windows-bmp
image/x-portable-bitmap
image/x-coreldraw
application/postscript
image/vnd.dxf

File extensions

.jpg
.jpe
.jpeg
.png
.gif
.bmp

Bases: Ingestor, OCRSupport, TimestampSupport

Image file ingestor class. Extracts the text from images using OCR.

Source code in ingestors/media/image.py

class ImageIngestor(Ingestor, OCRSupport, TimestampSupport):
    """Image file ingestor class. Extracts the text from images using OCR."""

    MIME_TYPES = [
        "image/x-portable-graymap",
        "image/png",
        "image/x-png",
        "image/jpeg",
        "image/jpg",
        "image/gif",
        "image/pjpeg",
        "image/bmp",
        "image/x-windows-bmp",
        "image/x-portable-bitmap",
        "image/x-coreldraw",
        "application/postscript",
        "image/vnd.dxf",
    ]
    EXTENSIONS = ["jpg", "jpe", "jpeg", "png", "gif", "bmp"]
    SCORE = 10

    def extract_exif(self, img, entity):
        """Copy selected EXIF tags from ``img`` onto ``entity``.

        ``DateTimeOriginal`` goes to ``authoredAt``, ``DateTime`` to ``date``,
        and ``Make``/``Model`` to ``generator``. Images without EXIF support
        or data are left untouched.
        """
        if not hasattr(img, "_getexif"):
            return

        exif = img._getexif()
        if exif is None:
            return

        for num, value in exif.items():
            try:
                tag = ExifTags.TAGS[num]
            except KeyError:
                log.warning("Unknown EXIF code: %s", num)
                continue
            if tag == "DateTimeOriginal":
                entity.add("authoredAt", self.parse_timestamp(value))
            if tag == "DateTime":
                entity.add("date", self.parse_timestamp(value))
            if tag == "Make":
                entity.add("generator", value)
            if tag == "Model":
                entity.add("generator", value)

    def ingest(self, file_path, entity):
        """Load the image, read its EXIF data and add OCR text as ``bodyText``.

        Raises:
            ProcessingException: if loading, EXIF extraction or OCR fails.
        """
        entity.schema = model.get("Image")
        with open(file_path, "rb") as fh:
            data = fh.read()

        try:
            image = Image.open(BytesIO(data))
            image.load()
            self.extract_exif(image, entity)
            languages = self.manager.context.get("languages")
            text = self.extract_ocr_text(data, languages=languages)
            entity.add("bodyText", text)
        except Exception as err:
            # Chain the cause, as the sibling ingestors do, so the original
            # traceback is preserved.
            raise ProcessingException("Failed to open image: %s" % err) from err

    @classmethod
    def match(cls, file_path, entity):
        """Score the file; any ``image/*`` MIME type is a ``SCORE - 1`` fallback."""
        score = super(ImageIngestor, cls).match(file_path, entity)
        if score <= 0:
            for mime_type in entity.get("mimeType"):
                if mime_type.startswith("image/"):
                    score = cls.SCORE - 1
        return score

JSONIngestor

ingestors.misc.jsonfile.JSONIngestor

File types

application/json
text/javascript

File extensions

.json

Bases: Ingestor, EncodingSupport

Source code in ingestors/misc/jsonfile.py

class JSONIngestor(Ingestor, EncodingSupport):
    """Ingest JSON documents by emitting every string value as a text fragment."""

    MIME_TYPES = [
        "application/json",
        "text/javascript",
    ]
    EXTENSIONS = ["json"]
    MAX_SIZE = 100 * MEGABYTE
    SCORE = 3

    def _collect_text(self, obj):
        """Recursively yield every string in ``obj``.

        Lists, sets and tuples are walked by element, dicts by value (keys
        are not yielded); any other type yields nothing.
        """
        if isinstance(obj, str):
            yield obj
        elif isinstance(obj, dict):
            for value in obj.values():
                yield from self._collect_text(value)
        elif isinstance(obj, (list, set, tuple)):
            for member in obj:
                yield from self._collect_text(member)

    def ingest(self, file_path, entity):
        """Parse the file and emit one text fragment per string value.

        Raises:
            ProcessingException: if the file exceeds ``MAX_SIZE`` or cannot be
                parsed.
        """
        if any(int(size) > self.MAX_SIZE for size in entity.get("fileSize")):
            raise ProcessingException("JSON file is too large.")

        with open(file_path, "rb") as raw:
            encoding = self.detect_stream_encoding(raw)

        with open(file_path, "r", encoding=encoding) as fh:
            try:
                document = json.load(fh)
                for idx, text in enumerate(self._collect_text(document)):
                    self.manager.emit_text_fragment(entity, [text], idx)
            except Exception as exc:
                raise ProcessingException("Cannot parse JSON file: %s" % exc) from exc

MboxFileIngestor

ingestors.email.mbox.MboxFileIngestor

File types

application/mbox

File extensions

.mbox

Bases: RFC822Ingestor, TempFileSupport

Source code in ingestors/email/mbox.py

class MboxFileIngestor(RFC822Ingestor, TempFileSupport):
    """Split an mbox mailbox into individual messages and queue each one."""

    DEFAULT_MIME = "application/mbox"
    MIME_TYPES = [DEFAULT_MIME]
    EXTENSIONS = ["mbox"]
    # Prefix that ``match`` looks for at the very start of a candidate file.
    MAGIC = "From "
    SCORE = 6

    def ingest(self, file_path, entity):
        """Store every message as its own file and queue it as a child ``Email``.

        A message that fails to extract is logged and skipped; it does not
        abort the rest of the mailbox.
        """
        mbox = mailbox.mbox(file_path)
        entity.schema = model.get("Package")
        entity.add("mimeType", self.DEFAULT_MIME)

        try:
            for i, msg in enumerate(mbox.itervalues(), 1):
                # Is there a risk of https://bugs.python.org/issue27321 ?
                try:
                    msg_path = self.make_work_file("%s.eml" % i)
                    with open(msg_path, "wb") as fh:
                        gen = BytesGenerator(fh, policy=default)
                        gen.flatten(msg)
                    checksum = self.manager.store(msg_path, mime_type=RFC822)
                    msg_path.unlink()
                    child = self.manager.make_entity("Email", parent=entity)
                    child.make_id(checksum)
                    child.add("contentHash", checksum)
                    child.add("mimeType", RFC822)
                    self.manager.queue_entity(child)
                except Exception:
                    log.exception("[%r] Cannot extract message %s", entity, i)
        finally:
            # Release the mailbox's file handle once all messages are handled.
            mbox.close()

    @classmethod
    def match(cls, file_path, entity):
        """Score the file, sniffing the content when the base score is negative.

        Returns ``SCORE`` when the file starts with ``MAGIC`` and contains at
        least one message; otherwise the score of the base matcher.
        """
        score = super(MboxFileIngestor, cls).match(file_path, entity)
        if score < 0:
            # this was added because a lot of mbox files are just called
            # 'inbox' or 'new', without a file suffix.
            # The file is read as bytes, so compare against the encoded magic;
            # bytes never compare equal to a str in Python 3.
            magic = cls.MAGIC.encode("ascii")
            with open(file_path, "rb") as fh:
                has_magic = fh.read(len(magic)) == magic
            if has_magic:
                mbox = mailbox.mbox(file_path)
                try:
                    for _ in mbox:
                        return cls.SCORE
                finally:
                    mbox.close()
        return score

RFC822Ingestor

ingestors.email.msg.RFC822Ingestor

File types

multipart/mixed
message/rfc822

File extensions

.eml
.rfc822
.email
.msg

Bases: Ingestor, EmailSupport, EncodingSupport

Source code in ingestors/email/msg.py

class RFC822Ingestor(Ingestor, EmailSupport, EncodingSupport):
    """Ingest RFC 822 email files.

    The message is parsed with the standard-library ``email`` package and its
    MIME tree is walked part by part: HTML and plain-text parts are added to
    the entity as body properties, other leaf parts are passed to
    ``ingest_attachment``.
    """

    MIME_TYPES = ["multipart/mixed", "message/rfc822"]
    # MIME types that are treated as message body rather than attachment.
    BODY_HTML = "text/html"
    BODY_PLAIN = "text/plain"
    BODY_TYPES = [BODY_HTML, BODY_PLAIN]
    BODY_RFC822 = "message/rfc822"
    # Lower-cased header names rendered for an embedded message.
    DISPLAY_HEADERS = ["from", "to", "cc", "bcc", "subject", "reply-to", "date"]
    EXTENSIONS = ["eml", "rfc822", "email", "msg"]
    SCORE = 7

    def has_alternative(self, parent, content_type):
        """Tell whether ``parent`` offers a sibling part of ``content_type``.

        Returns True only when ``parent`` is a ``multipart/alternative``
        container and one of its parts has the given content type.
        """
        if not parent:
            return False

        if normalize_mimetype(parent.get_content_type()) != "multipart/alternative":
            return False

        for part in parent.get_payload():
            if normalize_mimetype(part.get_content_type()) == content_type:
                return True

        return False

    def make_html_alternative(self, text):
        """Build an HTML rendering of plain ``text``.

        The text is escaped and stripped, and newlines become ``<br>``.
        Returns None for empty input.
        """
        if not text:
            return None

        return escape(text).strip().replace("\n", "<br>")

    def decode_part(self, part):
        """Return the decoded payload of ``part`` using its declared charset."""
        charset = part.get_content_charset()
        payload = part.get_payload(decode=True)
        return self.decode_string(payload, charset)

    def parse_html_part(self, entity, part, parent):
        """Process an HTML body part.

        The text returned by the HTML extraction is added as ``bodyText``
        only when the parent has no plain-text alternative.
        """
        payload = self.decode_part(part)
        text = self.extract_html_content(
            entity, payload, extract_metadata=False, add_index_text=False
        )

        if not self.has_alternative(parent, "text/plain"):
            entity.add("bodyText", text)

    def parse_plaintext_part(self, entity, part, parent):
        """Process a plain-text body part.

        The text is added as ``bodyText``; when the parent has no HTML
        alternative, an HTML rendering is added as ``bodyHtml`` too.
        """
        payload = self.decode_part(part)
        entity.add("bodyText", payload)

        if not self.has_alternative(parent, "text/html"):
            html = self.make_html_alternative(payload)
            entity.add("bodyHtml", html)

    def parse_rfc822_part(self, entity, part, parent):
        """Process an embedded ``message/rfc822`` part.

        The display headers of the embedded message are added to the body as
        text and HTML, then the part's sub-parts are parsed.
        """
        msg = part.get_payload(0)
        headers = [
            f"{name}: {value}"
            for name, value in msg.items()
            if name.lower() in self.DISPLAY_HEADERS
        ]
        text = "\n".join(headers)
        html = self.make_html_alternative(text)
        entity.add("bodyText", text)
        entity.add("bodyHtml", html)

        self.parse_parts(entity, part)

    def parse_part(self, entity, part, parent):
        """Dispatch one MIME part to the matching handler.

        The checks run in order: attachment, embedded message, multipart
        container, HTML body, plain-text body. A part matching none of them
        is logged as a dangling fragment.
        """
        mime_type = normalize_mimetype(part.get_content_type())
        file_name = part.get_filename()
        is_body_type = mime_type in self.BODY_TYPES
        # A part counts as an attachment when it is flagged as one, carries a
        # file name, or is a non-multipart part that is not a body type.
        is_attachment = part.is_attachment()
        is_attachment = is_attachment or file_name is not None
        is_attachment = is_attachment or (not is_body_type and not part.is_multipart())

        if is_attachment:
            if part.is_multipart():
                # The attachment is an email
                payload = str(part.get_payload(i=0))
            else:
                payload = part.get_payload(decode=True)
            self.ingest_attachment(entity, file_name, mime_type, payload)
            return

        if self.BODY_RFC822 in mime_type:
            self.parse_rfc822_part(entity, part, parent)
            return

        if part.is_multipart():
            self.parse_parts(entity, part)
            return

        if self.BODY_HTML in mime_type:
            self.parse_html_part(entity, part, parent)
            return

        if self.BODY_PLAIN in mime_type:
            self.parse_plaintext_part(entity, part, parent)
            return

        log.error("Dangling MIME fragment: %s", part)

    def parse_parts(self, entity, parent):
        """Parse every direct sub-part of ``parent``."""
        for part in parent.get_payload():
            self.parse_part(entity, part, parent)

    def ingest_msg(self, entity, msg):
        """Extract the headers of ``msg`` and parse its body part(s)."""
        self.extract_msg_headers(entity, msg)
        self.resolve_message_ids(entity)

        if msg.is_multipart():
            self.parse_parts(entity, msg)
        else:
            # A single-part message has no parent container.
            self.parse_part(entity, msg, None)

    def ingest(self, file_path, entity):
        """Parse the file as an email message and ingest it.

        Raises:
            ProcessingException: if the message cannot be parsed.
        """
        entity.schema = model.get("Email")
        try:
            with open(file_path, "rb") as fh:
                msg = email.message_from_binary_file(fh, policy=default)
        except (MessageError, ValueError, IndexError) as err:
            raise ProcessingException("Cannot parse email: %s" % err) from err

        self.ingest_msg(entity, msg)

OpenOfficeSpreadsheetIngestor

ingestors.tabular.ods.OpenOfficeSpreadsheetIngestor

File types

application/vnd.oasis.opendocument.spreadsheet
application/vnd.oasis.opendocument.spreadsheet-template

File extensions

.ods
.ots

Bases: Ingestor, TableSupport, OpenDocumentSupport

Source code in ingestors/tabular/ods.py

class OpenOfficeSpreadsheetIngestor(Ingestor, TableSupport, OpenDocumentSupport):
    """Ingest OpenDocument spreadsheets, emitting one ``Table`` per sheet."""

    MIME_TYPES = [
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet-template",
    ]
    EXTENSIONS = ["ods", "ots"]
    SCORE = 7
    # Typed-value attributes probed, in this order, for non-currency cells.
    VALUE_FIELDS = ["date-value", "time-value", "boolean-value", "value"]

    def convert_cell(self, cell):
        """Return the value of ``cell``.

        Currency cells yield ``"<value> <currency>"`` (just the value when no
        currency attribute is set, ``None`` when there is no value). Other
        cells yield the first non-``None`` attribute from ``VALUE_FIELDS``,
        falling back to the cell's paragraph text.
        """
        cell_type = cell.getAttrNS(OFFICENS, "value-type")
        if cell_type == "currency":
            amount = cell.getAttrNS(OFFICENS, "value")
            unit = cell.getAttrNS(OFFICENS, cell_type)
            if amount is None or unit is None:
                # No amount -> None; amount without a unit -> bare amount.
                return amount
            return amount + " " + unit

        # Lazily probe the typed-value attributes; stop at the first hit.
        candidates = (cell.getAttrNS(OFFICENS, attr) for attr in self.VALUE_FIELDS)
        typed = next((found for found in candidates if found is not None), None)
        if typed is not None:
            return typed

        return self.read_text_cell(cell)

    def read_text_cell(self, cell):
        """Join the text of every paragraph in ``cell`` with newlines."""
        return "\n".join(extractText(para) for para in cell.getElementsByType(P))

    def generate_csv(self, table):
        """Yield one list of cell values per row of ``table``.

        A cell's value is repeated as many times as its
        ``numbercolumnsrepeated`` attribute says (once when it is unset).
        """
        for row in table.getElementsByType(TableRow):
            line = []
            for cell in row.getElementsByType(TableCell):
                repeat = cell.getAttribute("numbercolumnsrepeated") or 1
                converted = self.convert_cell(cell)
                line.extend([converted] * int(repeat))
            yield line

    def ingest(self, file_path, entity):
        """Turn ``entity`` into a ``Workbook`` and emit a table for each sheet."""
        entity.schema = model.get("Workbook")
        document = self.parse_opendocument(file_path, entity)
        for sheet in document.spreadsheet.getElementsByType(Table):
            title = sheet.getAttribute("name")
            table = self.manager.make_entity("Table", parent=entity)
            table.make_id(entity.id, title)
            table.set("title", title)
            # Emit an initial fragment carrying the parent reference and the
            # name before the rows are processed, so a failure half-way
            # through does not leave orphan fragments behind.
            # See https://github.com/alephdata/ingest-file/issues/171
            self.manager.emit_entity(table, fragment="initial")
            rows = self.generate_csv(sheet)
            self.emit_row_tuples(table, rows)
            if not table.has("csvHash"):
                continue
            self.manager.emit_entity(table)

DocumentIngestor

ingestors.documents.office.DocumentIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

Open/Libre Office with dependencies
image ingestor dependencies to cover any embedded images OCR

File types

text/richtext
text/rtf
application/rtf
application/x-rtf
application/msword
application/vnd.ms-word
application/wordperfect
application/vnd.wordperfect
application/vnd.ms-powerpoint
application/vnd.sun.xml.impress
application/vnd.ms-powerpoint.presentation
application/vnd.ms-powerpoint.presentation.12
application/CDFV2-unknown
application/CDFV2-corrupt
application/clarisworks
application/epub+zip
application/macwriteii
application/msword
application/prs.plucker
application/vnd.corel-draw
application/vnd.lotus-wordpro
application/vnd.ms-powerpoint
application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml
application/vnd.ms-works
application/vnd.palm
application/vnd.sun.xml.draw
application/vnd.sun.xml.draw.template
application/vnd.sun.xml.impress
application/vnd.sun.xml.impress.template
application/vnd.sun.xml.writer
application/vnd.sun.xml.writer.global
application/vnd.sun.xml.writer.template
application/vnd.sun.xml.writer.web
application/vnd.visio
application/vnd.wordperfect
application/x-abiword
application/x-aportisdoc
application/x-fictionbook+xml
application/x-hwp
application/x-iwork-keynote-sffkey
application/x-iwork-pages-sffpages
application/x-mspublisher
application/x-mswrite
application/x-pagemaker
application/x-sony-bbeb
application/x-t602
image/x-cmx
image/x-freehand
image/x-wpg

File extensions

.602
.abw
.cdr
.cmx
.cwk
.doc
.dot
.dps
.dpt
.epub
.fb2
.fh
.fh1
.fh10
.fh11
.fh2
.fh3
.fh4
.fh5
.fh6
.fh7
.fh8
.fh9
.fodg
.fodp
.fodt
.hwp
.key
.lrf
.lwp
.mcw
.mw
.mwd
.nxd
.odg
.odm
.otg
.oth
.otm
.otp
.ott
.p65
.pages
.pdb
.pm
.pm6
.pmd
.pot
.pps
.ppt
.pub
.qxd
.qxt
.rtf
.sda
.sdd
.sdw
.std
.sti
.stw
.sxd
.sxg
.sxi
.sxw
.vdx
.vsd
.vsdm
.vsdx
.wn
.wpd
.wpg
.wps
.wpt
.wri
.xlc
.xlm
.xls
.xlw
.zabw
.zmf

Bases: Ingestor, OLESupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

Open/Libre Office with dependencies
image ingestor dependencies to cover any embedded images OCR

Source code in ingestors/documents/office.py

class DocumentIngestor(Ingestor, OLESupport, PDFSupport):
    """Office/Word document ingestor class.

    Converts the document to PDF and extracts the text.
    Mostly a slightly adjusted PDF ingestor.

    Requires system tools:

    - Open/Libre Office with dependencies
    - image ingestor dependencies to cover any embedded images OCR
    """

    # NOTE: every entry must end with a comma. Adjacent string literals are
    # concatenated by Python, which silently fuses two MIME types into one
    # string that matches neither.
    MIME_TYPES = [
        # Text documents
        "text/richtext",
        "text/rtf",
        "application/rtf",
        "application/x-rtf",
        "application/msword",
        "application/vnd.ms-word",
        "application/wordperfect",
        "application/vnd.wordperfect",
        # Presentations
        "application/vnd.ms-powerpoint",
        "application/vnd.sun.xml.impress",
        "application/vnd.ms-powerpoint.presentation",
        "application/vnd.ms-powerpoint.presentation.12",
        # MS Office files with short stream missing
        "application/CDFV2-unknown",
        "application/CDFV2-corrupt",
        "application/clarisworks",  # ClarisWorks_Draw
        "application/epub+zip",  # EPUB Document
        "application/macwriteii",  # MacWrite
        "application/msword",  # MS Word 2007 XML VBA
        "application/prs.plucker",  # Plucker eBook
        "application/vnd.corel-draw",  # Corel Draw Document
        "application/vnd.lotus-wordpro",  # LotusWordPro
        "application/vnd.ms-powerpoint",  # MS PowerPoint 97 Vorlage
        "application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml",  # Impress MS PowerPoint 2007 XML VBA  # noqa
        "application/vnd.ms-works",  # Mac_Works
        "application/vnd.palm",  # Palm_Text_Document
        "application/vnd.sun.xml.draw",  # StarOffice XML (Draw)
        "application/vnd.sun.xml.draw.template",  # draw_StarOffice_XML_Draw_Template  # noqa
        "application/vnd.sun.xml.impress",  # StarOffice XML (Impress)
        "application/vnd.sun.xml.impress.template",  # impress_StarOffice_XML_Impress_Template  # noqa
        "application/vnd.sun.xml.writer",  # StarOffice XML (Writer)
        "application/vnd.sun.xml.writer.global",  # writer_globaldocument_StarOffice_XML_Writer_GlobalDocument  # noqa
        "application/vnd.sun.xml.writer.template",  # writer_StarOffice_XML_Writer_Template  # noqa
        "application/vnd.sun.xml.writer.web",  # writer_web_StarOffice_XML_Writer_Web_Template  # noqa
        "application/vnd.visio",  # Visio Document
        "application/vnd.wordperfect",  # WordPerfect
        "application/x-abiword",  # AbiWord
        "application/x-aportisdoc",  # PalmDoc
        "application/x-fictionbook+xml",  # FictionBook 2
        "application/x-hwp",  # writer_MIZI_Hwp_97
        "application/x-iwork-keynote-sffkey",  # Apple Keynote
        "application/x-iwork-pages-sffpages",  # Apple Pages
        "application/x-mspublisher",  # Publisher Document
        "application/x-mswrite",  # MS_Write
        "application/x-pagemaker",  # PageMaker Document
        "application/x-sony-bbeb",  # BroadBand eBook
        "application/x-t602",  # T602Document
        "image/x-cmx",  # Corel Presentation Exchange
        "image/x-freehand",  # Freehand Document
        "image/x-wpg",  # WordPerfect Graphics
    ]
    EXTENSIONS = [
        "602",  # T602Document
        "abw",  # AbiWord
        "cdr",  # Corel Draw Document
        "cmx",  # Corel Presentation Exchange
        "cwk",  # ClarisWorks_Draw
        "doc",  # Mac_Word
        "dot",  # MS Word 97 Vorlage
        "dps",  # MS PowerPoint 97
        "dpt",  # MS PowerPoint 97 Vorlage
        "epub",  # EPUB Document
        "fb2",  # FictionBook 2
        "fh",  # Freehand Document
        "fh1",  # Freehand Document
        "fh10",  # Freehand Document
        "fh11",  # Freehand Document
        "fh2",  # Freehand Document
        "fh3",  # Freehand Document
        "fh4",  # Freehand Document
        "fh5",  # Freehand Document
        "fh6",  # Freehand Document
        "fh7",  # Freehand Document
        "fh8",  # Freehand Document
        "fh9",  # Freehand Document
        "fodg",  # OpenDocument Drawing Flat XML
        "fodp",  # OpenDocument Presentation Flat XML
        "fodt",  # OpenDocument Text Flat XML
        "hwp",  # writer_MIZI_Hwp_97
        "key",  # Apple Keynote
        "lrf",  # BroadBand eBook
        "lwp",  # LotusWordPro
        "mcw",  # MacWrite
        "mw",  # MacWrite
        "mwd",  # Mariner_Write
        "nxd",  # WriteNow
        "odg",  # draw8
        "odm",  # writerglobal8
        "otg",  # draw8_template
        "oth",  # writerweb8_writer_template
        "otm",  # writerglobal8_template
        "otp",  # impress8_template
        "ott",  # writer8_template
        "p65",  # PageMaker Document
        "pages",  # Apple Pages
        "pdb",  # Palm_Text_Document
        "pm",  # PageMaker Document
        "pm6",  # PageMaker Document
        "pmd",  # PageMaker Document
        "pot",  # PowerPoint 3
        "pps",  # MS PowerPoint 97 AutoPlay
        "ppt",  # PowerPoint 3
        # 'pptm',  # Impress Office Open XML
        "pub",  # Publisher Document
        "qxd",  # QXP Document
        "qxt",  # QXP Document
        "rtf",  # Rich Text Format
        "sda",  # StarOffice_Drawing
        "sdd",  # StarOffice_Presentation
        "sdw",  # StarOffice_Writer
        "std",  # draw_StarOffice_XML_Draw_Template
        "sti",  # impress_StarOffice_XML_Impress_Template
        "stw",  # writer_StarOffice_XML_Writer_Template
        "sxd",  # StarOffice XML (Draw)
        "sxg",  # writer_globaldocument_StarOffice_XML_Writer_GlobalDocument
        "sxi",  # StarOffice XML (Impress)
        "sxw",  # StarOffice XML (Writer)
        # 'tab',  # Text
        # 'tsv',  # Text
        # 'txt',  # Text
        "vdx",  # Visio Document
        "vsd",  # Visio Document
        "vsdm",  # Visio Document
        "vsdx",  # Visio Document
        "wn",  # WriteNow
        "wpd",  # WordPerfect
        "wpg",  # WordPerfect Graphics
        "wps",  # Mac_Works
        "wpt",  # MS Word 97 Vorlage
        "wri",  # MS_Write
        "xlc",  # MS Excel 95
        "xlm",  # MS Excel 95
        "xls",  # MS Excel 95
        "xlw",  # MS Excel 95
        # 'xml',  # OpenDocument Drawing Flat XML
        "zabw",  # AbiWord
        # 'zip',  # FictionBook 2
        "zmf",  # ZMF Document
    ]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        Sets the entity schema to ``Pages``, extracts OLE metadata from the
        file, converts it to PDF inside a temporary directory and passes the
        resulting PDF to ``pdf_alternative_extract``.
        """
        entity.schema = model.get("Pages")
        self.extract_ole_metadata(file_path, entity)
        with tempfile.TemporaryDirectory() as unique_tmpdir:
            # TODO - write to logs the case in which the context manager can't delete these dirs
            pdf_path = self.document_to_pdf(unique_tmpdir, file_path, entity)
            self.pdf_alternative_extract(entity, pdf_path, self.manager)

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/office.py

def ingest(self, file_path, entity):
    """Ingest an office document by way of a PDF conversion.

    The entity is marked as ``Pages`` and OLE metadata is extracted; the file
    is then converted to PDF inside a temporary directory and that PDF is
    passed to ``pdf_alternative_extract``.
    """
    entity.schema = model.get("Pages")
    self.extract_ole_metadata(file_path, entity)
    with tempfile.TemporaryDirectory() as scratch_dir:
        # TODO - write to logs the case in which the context manager can't delete these dirs
        converted_pdf = self.document_to_pdf(scratch_dir, file_path, entity)
        self.pdf_alternative_extract(entity, converted_pdf, self.manager)

OutlookMsgIngestor

ingestors.email.outlookmsg.OutlookMsgIngestor

File types

application/msg
application/x-msg
application/vnd.ms-outlook
msg/rfc822

File extensions

.msg

Bases: Ingestor, EmailSupport, OLESupport, TempFileSupport

Source code in ingestors/email/outlookmsg.py

class OutlookMsgIngestor(Ingestor, EmailSupport, OLESupport, TempFileSupport):
    """Ingest Outlook ``.msg`` files as ``Email`` entities."""

    MIME_TYPES = [
        "application/msg",
        "application/x-msg",
        "application/vnd.ms-outlook",
        "msg/rfc822",
    ]
    EXTENSIONS = ["msg"]
    SCORE = 10

    def get_identity(self, name, email):
        """Build an ``EmailIdentity`` from a name/address pair."""
        identity = EmailIdentity(self.manager, name, email)
        return identity

    def ingest(self, file_path, entity):
        """Open the message file and ingest it into ``entity``.

        Raises:
            ProcessingException: if the file cannot be opened as a message.
        """
        entity.schema = model.get("Email")
        try:
            message = Message(file_path.as_posix())
        except Exception as exc:
            raise ProcessingException("Cannot open message file: %s" % exc) from exc

        self.extract_olefileio_metadata(message.ole, entity)
        self.ingest_message(message, entity)

    def ingest_message(self, msg, entity):
        """Copy headers, bodies, identities and attachments of ``msg`` onto ``entity``.

        Embedded messages are ingested recursively as child ``Email``
        entities; data attachments go through ``ingest_attachment``.
        """
        try:
            self.extract_msg_headers(entity, msg.header)
        except Exception:
            log.exception("Cannot parse Outlook-stored headers")

        entity.add("subject", msg.subject)
        entity.add("threadTopic", msg.getStringField("0070"))
        entity.add("encoding", msg.encoding)
        entity.add("bodyText", msg.body)
        entity.add("bodyHtml", msg.htmlBody)
        entity.add("messageId", self.parse_message_ids(msg.message_id))

        # Reading the RTF body may fail; treat that like a missing body.
        rtf_bytes = None
        try:
            rtf_bytes = msg.rtfBody
        except Exception:
            log.exception("Cannot parse RTF body of the email")

        if rtf_bytes is not None:
            # Store the RTF body and queue it as a separate child document.
            work_file = self.make_work_file("body.rtf")
            with open(work_file, "wb") as sink:
                sink.write(rtf_bytes)
            digest = self.manager.store(work_file, mime_type=RTF_MIME)
            work_file.unlink()

            body_doc = self.manager.make_entity("Document", parent=entity)
            body_doc.make_id(entity.id, "outlook-msg.rtf.body")
            body_doc.add("fileName", "body.rtf")
            body_doc.add("contentHash", digest)
            body_doc.add("mimeType", RTF_MIME)
            self.manager.queue_entity(body_doc)

        if not entity.has("inReplyTo"):
            entity.add("inReplyTo", self.parse_references(msg.references, []))

        try:
            entity.add("date", parsedate_to_datetime(msg.date).isoformat())
        except Exception:
            log.warning("Could not parse date: %s", msg.date)

        # sender name and email
        sender_ids = self.get_identities(msg.sender)
        self.apply_identities(entity, sender_ids, "emitters", "sender")

        # received by
        received_by = self.get_identity(
            msg.getStringField("0040"), msg.getStringField("0076")
        )
        self.apply_identities(entity, received_by, "emitters")

        from_ids = self.get_identities(msg.getStringField("1046"))
        self.apply_identities(entity, from_ids, "emitters", "from")

        # Recipient fields share their name between the message attribute
        # and the entity property; each is read only when its turn comes.
        for field in ("to", "cc", "bcc"):
            recipients = self.get_identities(getattr(msg, field))
            self.apply_identities(entity, recipients, "recipients", field)

        self.resolve_message_ids(entity)
        for attachment in msg.attachments:
            if attachment.type == "msg":
                nested = self.manager.make_entity("Email", parent=entity)
                nested.make_id(entity.id, attachment.data.prefix)
                nested.add("fileName", attachment.long_filename)
                nested.add("fileName", attachment.short_filename)
                nested.add("mimeType", "application/vnd.ms-outlook")
                self.ingest_message(attachment.data, nested)
                self.manager.emit_entity(nested, fragment=attachment.data.prefix)
            if attachment.type == "data":
                label = stringify(attachment.long_filename) or stringify(
                    attachment.short_filename
                )
                self.ingest_attachment(
                    entity, label, attachment.content_type, attachment.data
                )

    @classmethod
    def match(cls, file_path, entity):
        """Score the file, returning ``-1`` for a positive match that is not an OLE file."""
        score = super().match(file_path, entity)
        if not (score > 0):
            return score
        if isOleFile(file_path):
            return score
        return -1

OutlookOLMArchiveIngestor

ingestors.email.olm.OutlookOLMArchiveIngestor

File types

File extensions

.olm

Bases: Ingestor, TempFileSupport, XMLSupport

Source code in ingestors/email/olm.py

class OutlookOLMArchiveIngestor(Ingestor, TempFileSupport, XMLSupport):
    """Unpack an Outlook OLM archive.

    OLM files are zip archives with emails stored as XML files. Each
    ``message_*.xml`` member is stored and queued as a child document, and the
    folders above it are emitted as ``Folder`` entities.
    """

    MIME_TYPES = []
    EXTENSIONS = ["olm"]
    SCORE = 10
    # Path segments that never become a Folder entity. They still contribute
    # to the IDs of the folders below them.
    EXCLUDE = ["com.microsoft.__Messages"]

    def extract_file(self, zipf, name):
        """Extract a message file from the OLM zip archive.

        Returns the path of the work file. If ``name`` is not a member of the
        archive a warning is logged and the (empty) work file is still
        returned.
        """
        path = pathlib.Path(name)
        base_name = safe_filename(path.name)
        out_file = self.make_work_file(base_name)
        with open(out_file, "w+b") as outfh:
            try:
                with zipf.open(name) as infh:
                    shutil.copyfileobj(infh, outfh)
            except KeyError:
                log.warning("Cannot load zip member: %s", name)
        return out_file

    def extract_hierarchy(self, entity, name):
        """Given a file path, create all its ancestor folders as entities.

        Returns the entity of the innermost folder created, or ``entity``
        itself when no folder was created.
        """
        foreign_id = pathlib.PurePath(entity.id)
        path = ensure_path(name)
        # Drop the last component (the file itself); only folders are emitted.
        for segment in path.as_posix().split("/")[:-1]:
            foreign_id = foreign_id.joinpath(segment)
            if segment in self.EXCLUDE:
                continue
            entity = self.manager.make_entity("Folder", parent=entity)
            entity.add("fileName", segment)
            entity.make_id(foreign_id.as_posix())
            self.manager.emit_entity(entity)
        return entity

    def extract_attachment(self, zipf, message, attachment):
        """Create an entity for an attachment; assign its parent and put it
        on the task queue to be processed.

        Attachments without an ``OPFAttachmentURL`` are not queued.
        """
        url = attachment.get("OPFAttachmentURL")
        name = attachment.get("OPFAttachmentName")
        name = name or attachment.get("OPFAttachmentContentID")
        child = self.manager.make_entity("Document", parent=message)
        if url is not None:
            file_path = self.extract_file(zipf, url)
            mime_type = attachment.get("OPFAttachmentContentType")
            checksum = self.manager.store(file_path, mime_type=mime_type)
            child.make_id(name, checksum)
            child.add("fileName", attachment.get("OPFAttachmentName"))
            child.add("fileName", attachment.get("OPFAttachmentContentID"))
            child.add("mimeType", mime_type)
            child.add("contentHash", checksum)
            self.manager.queue_entity(child)

    def extract_message(self, root, zipf, name):
        """Queue one message member of the archive, plus its attachments.

        Members that are not ``message_*.xml`` files are ignored.
        """
        # Individual messages are stored as message_xxx.xml files. We want to
        # process these files and skip the others
        if "message_" not in name or not name.endswith(".xml"):
            return
        # Create the parent folders as entities with proper hierarchy
        parent = self.extract_hierarchy(root, name)
        # Extract the xml file itself and put it on the task queue to be
        # ingested by OutlookOLMMessageIngestor as an individual message
        xml_path = self.extract_file(zipf, name)
        checksum = self.manager.store(xml_path, mime_type=MIME)
        child = self.manager.make_entity("Document", parent=parent)
        child.make_id(checksum)
        child.add("contentHash", checksum)
        child.add("mimeType", MIME)
        self.manager.queue_entity(child)
        try:
            doc = self.parse_xml_path(xml_path)
            # find all attachments mentioned in the current xml file, assign
            # them their parent and put them on the queue to be processed
            for el in doc.findall(".//messageAttachment"):
                self.extract_attachment(zipf, child, el)
        except ProcessingException as err:
            # Best effort: the message itself is already queued, so a failure
            # here only costs its attachments. Log it instead of hiding it.
            log.warning("Cannot process attachments of message %s: %s", name, err)

    def ingest(self, file_path, entity):
        """Walk every member of the OLM archive and extract its messages.

        A failure on a single member is logged and does not abort the run.

        Raises:
            ProcessingException: if the file is not a valid zip archive.
        """
        entity.schema = model.get("Package")
        self._hierarchy = {}
        try:
            # OLM files are zip archives with emails stored as xml files
            with zipfile.ZipFile(file_path, "r") as zipf:
                for name in zipf.namelist():
                    try:
                        self.extract_message(entity, zipf, name)
                    except Exception:
                        log.exception("Error processing message: %s", name)
        except zipfile.BadZipfile as err:
            # Chain the cause so the underlying zip error stays visible.
            raise ProcessingException("Invalid OLM file.") from err

`extract_attachment(zipf, message, attachment)`

Create an entity for an attachment; assign its parent and put it on the task queue to be processed

Source code in ingestors/email/olm.py

def extract_attachment(self, zipf, message, attachment):
    """Queue a ``Document`` entity for one attachment of ``message``.

    The attachment file is pulled out of the archive, stored, and the new
    entity is put on the task queue. Attachments without an
    ``OPFAttachmentURL`` are not queued.
    """
    source_url = attachment.get("OPFAttachmentURL")
    label = attachment.get("OPFAttachmentName") or attachment.get(
        "OPFAttachmentContentID"
    )
    document = self.manager.make_entity("Document", parent=message)
    if source_url is None:
        return

    stored_path = self.extract_file(zipf, source_url)
    content_type = attachment.get("OPFAttachmentContentType")
    digest = self.manager.store(stored_path, mime_type=content_type)
    document.make_id(label, digest)
    # Both the display name and the content ID are recorded as file names.
    for key in ("OPFAttachmentName", "OPFAttachmentContentID"):
        document.add("fileName", attachment.get(key))
    document.add("mimeType", content_type)
    document.add("contentHash", digest)
    self.manager.queue_entity(document)

`extract_file(zipf, name)`

Extract a message file from the OLM zip archive

Source code in ingestors/email/olm.py

def extract_file(self, zipf, name):
    """Copy the archive member ``name`` into a work file and return its path.

    If the archive has no such member a warning is logged and the (empty)
    work file is returned anyway.
    """
    target = self.make_work_file(safe_filename(pathlib.Path(name).name))
    with open(target, "w+b") as sink:
        try:
            with zipf.open(name) as source:
                shutil.copyfileobj(source, sink)
        except KeyError:
            log.warning("Cannot load zip member: %s", name)
    return target

`extract_hierarchy(entity, name)`

Given a file path, create all its ancestor folders as entities

Source code in ingestors/email/olm.py

def extract_hierarchy(self, entity, name):
    """Emit a ``Folder`` entity for each ancestor directory of ``name``.

    Segments listed in ``EXCLUDE`` get no entity but still contribute to the
    IDs of the folders below them. Returns the innermost folder created, or
    ``entity`` itself when none was created.
    """
    trail = pathlib.PurePath(entity.id)
    # Everything but the last component is a folder; the last is the file.
    *ancestors, _leaf = ensure_path(name).as_posix().split("/")
    current = entity
    for segment in ancestors:
        trail = trail / segment
        if segment not in self.EXCLUDE:
            folder = self.manager.make_entity("Folder", parent=current)
            folder.add("fileName", segment)
            folder.make_id(trail.as_posix())
            self.manager.emit_entity(folder)
            current = folder
    return current

OfficeOpenXMLIngestor

ingestors.documents.ooxml.OfficeOpenXMLIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

File types

application/vnd.openxmlformats-officedocument.wordprocessingml.document
application/vnd.openxmlformats-officedocument.wordprocessingml.template
application/vnd.openxmlformats-officedocument.presentationml.slideshow
application/vnd.openxmlformats-officedocument.presentationml.presentation
application/vnd.openxmlformats-officedocument.presentationml.template
application/vnd.openxmlformats-officedocument.presentationml.slideshow

File extensions

.docx
.docm
.dotx
.dotm
.potx
.pptx
.ppsx
.pptm
.ppsm
.potm

Bases: Ingestor, OOXMLSupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Source code in ingestors/documents/ooxml.py

class OfficeOpenXMLIngestor(Ingestor, OOXMLSupport, PDFSupport):
    """Ingestor for Office Open XML text documents and presentations.

    The document is converted to PDF and its text extracted from that PDF.
    """

    # Common prefix of all the MIME types handled here.
    PREFIX = "application/vnd.openxmlformats-officedocument."
    MIME_TYPES = [
        PREFIX + "wordprocessingml.document",
        PREFIX + "wordprocessingml.template",
        PREFIX + "presentationml.slideshow",
        PREFIX + "presentationml.presentation",
        PREFIX + "presentationml.template",
        PREFIX + "presentationml.slideshow",
    ]
    EXTENSIONS = [
        "docx",
        "docm",
        "dotx",
        "dotm",
        "potx",
        "pptx",
        "ppsx",
        "pptm",
        "ppsm",
        "potm",
    ]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Extract OOXML metadata, then convert to PDF and extract the pages."""
        entity.schema = model.get("Pages")
        self.ooxml_extract_metadata(file_path, entity)
        with tempfile.TemporaryDirectory() as work_dir:
            # TODO - write to logs the case in which the context manager can't delete these dirs
            converted = self.document_to_pdf(work_dir, file_path, entity)
            self.pdf_alternative_extract(entity, converted, self.manager)

    @classmethod
    def match(cls, file_path, entity):
        """Score the file, doubling ``SCORE`` when the OOXML manifest check passes.

        The manifest is only inspected when the inherited score is positive.
        """
        inherited = super().match(file_path, entity)
        if not (inherited > 0):
            return inherited
        if not cls.inspect_ooxml_manifest(file_path):
            return inherited
        return cls.SCORE * 2

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/ooxml.py

def ingest(self, file_path, entity):
    """Ingest an Office Open XML document by way of a PDF conversion.

    After the schema is set to ``Pages`` and the OOXML metadata extracted,
    the file is converted to PDF in a temporary directory and the PDF is
    handed to ``pdf_alternative_extract``.
    """
    entity.schema = model.get("Pages")
    self.ooxml_extract_metadata(file_path, entity)
    with tempfile.TemporaryDirectory() as work_dir:
        # TODO - write to logs the case in which the context manager can't delete these dirs
        converted = self.document_to_pdf(work_dir, file_path, entity)
        self.pdf_alternative_extract(entity, converted, self.manager)

OpenDocumentIngestor

ingestors.documents.opendoc.OpenDocumentIngestor

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

Open/Libre Office with dependencies
image ingestor dependencies to cover any embedded images OCR

File types

application/vnd.oasis.opendocument.text
application/vnd.oasis.opendocument.text-template
application/vnd.oasis.opendocument.presentation
application/vnd.oasis.opendocument.graphics
application/vnd.oasis.opendocument.graphics-flat-xml
application/vnd.oasis.opendocument.graphics-template
application/vnd.oasis.opendocument.presentation-flat-xml
application/vnd.oasis.opendocument.presentation-template
application/vnd.oasis.opendocument.chart
application/vnd.oasis.opendocument.chart-template
application/vnd.oasis.opendocument.image
application/vnd.oasis.opendocument.image-template
application/vnd.oasis.opendocument.formula
application/vnd.oasis.opendocument.formula-template
application/vnd.oasis.opendocument.text-flat-xml
application/vnd.oasis.opendocument.text-master
application/vnd.oasis.opendocument.text-web

File extensions

.odt
.odp
.otp

Bases: Ingestor, OpenDocumentSupport, PDFSupport

Office/Word document ingestor class.

Converts the document to PDF and extracts the text. Mostly a slightly adjusted PDF ingestor.

Requires system tools:

Open/Libre Office with dependencies
image ingestor dependencies to cover any embedded images OCR

Source code in ingestors/documents/opendoc.py

class OpenDocumentIngestor(Ingestor, OpenDocumentSupport, PDFSupport):
    """Office/Word document ingestor class.

    Converts the document to PDF and extracts the text.
    Mostly a slightly adjusted PDF ingestor.

    Requires system tools:

    - Open/Libre Office with dependencies
    - image ingestor dependencies to cover any embedded images OCR

    """

    # NOTE: every entry must end with a comma. Adjacent string literals are
    # concatenated by Python, which silently fuses two MIME types into one
    # string that matches neither.
    MIME_TYPES = [
        "application/vnd.oasis.opendocument.text",
        "application/vnd.oasis.opendocument.text-template",
        "application/vnd.oasis.opendocument.presentation",
        "application/vnd.oasis.opendocument.graphics",
        "application/vnd.oasis.opendocument.graphics-flat-xml",
        "application/vnd.oasis.opendocument.graphics-template",
        "application/vnd.oasis.opendocument.presentation-flat-xml",
        "application/vnd.oasis.opendocument.presentation-template",
        "application/vnd.oasis.opendocument.chart",
        "application/vnd.oasis.opendocument.chart-template",
        "application/vnd.oasis.opendocument.image",
        "application/vnd.oasis.opendocument.image-template",
        "application/vnd.oasis.opendocument.formula",
        "application/vnd.oasis.opendocument.formula-template",
        "application/vnd.oasis.opendocument.text-flat-xml",
        "application/vnd.oasis.opendocument.text-master",
        "application/vnd.oasis.opendocument.text-web",
    ]
    EXTENSIONS = ["odt", "odp", "otp"]
    SCORE = 7

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        Sets the entity schema to ``Pages``, parses the OpenDocument file via
        ``parse_opendocument``, converts it to PDF inside a temporary
        directory and passes the resulting PDF to ``pdf_alternative_extract``.
        """
        entity.schema = model.get("Pages")
        self.parse_opendocument(file_path, entity)
        with tempfile.TemporaryDirectory() as unique_tmpdir:
            # TODO - write to logs the case in which the context manager can't delete these dirs
            pdf_path = self.document_to_pdf(unique_tmpdir, file_path, entity)
            self.pdf_alternative_extract(entity, pdf_path, self.manager)

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/opendoc.py

def ingest(self, file_path, entity):
    """Ingest an OpenDocument file by way of a PDF conversion.

    The schema is set to ``Pages`` and the file is parsed with
    ``parse_opendocument``; it is then converted to PDF in a temporary
    directory and the PDF is handed to ``pdf_alternative_extract``.
    """
    entity.schema = model.get("Pages")
    self.parse_opendocument(file_path, entity)
    with tempfile.TemporaryDirectory() as tmp_root:
        # TODO - write to logs the case in which the context manager can't delete these dirs
        rendered = self.document_to_pdf(tmp_root, file_path, entity)
        self.pdf_alternative_extract(entity, rendered, self.manager)

OutlookOLMMessageIngestor

ingestors.email.olm.OutlookOLMMessageIngestor

File types

application/xml+opfmessage

File extensions

Bases: Ingestor, XMLSupport, EmailSupport, TimestampSupport

Source code in ingestors/email/olm.py

class OutlookOLMMessageIngestor(Ingestor, XMLSupport, EmailSupport, TimestampSupport):
    """Ingest a single OPF XML message file as an ``Email`` entity.

    The file must contain exactly one ``<email>`` element. Its direct
    children (``OPFMessageCopy*`` tags) supply the headers and bodies that
    are copied onto the entity.
    """

    MIME_TYPES = [MIME]
    EXTENSIONS = []
    SCORE = 15

    def get_contacts(self, doc, tag):
        """Yield an ``EmailIdentity`` for every ``emailAddress`` below ``tag``.

        :param doc: element to search in (the ``<email>`` element).
        :param tag: name of the child element that wraps the addresses.
        """
        path = "./%s/emailAddress" % tag
        for address in doc.findall(path):
            name = address.get("OPFContactEmailAddressName")
            email = address.get("OPFContactEmailAddressAddress")
            yield EmailIdentity(self.manager, name, email)

    def get_date(self, props, tag):
        """Remove ``tag`` from ``props`` and return it run through ``parse_timestamp``."""
        return self.parse_timestamp(props.pop(tag, None))

    def ingest(self, file_path, entity):
        """Populate ``entity`` from the OPF message stored at ``file_path``.

        :raises ProcessingException: if the XML cannot be parsed, or if the
            file does not contain exactly one ``<email>`` element.
        """
        entity.schema = model.get("Email")
        try:
            doc = self.parse_xml_path(file_path)
        except TypeError as terr:
            raise ProcessingException("Cannot parse OPF XML file.") from terr

        # Look the elements up once and distinguish "none" from "too many",
        # so an empty file is not misreported as containing several emails.
        emails = doc.findall(".//email")
        if not emails:
            raise ProcessingException("No email found in file.")
        if len(emails) > 1:
            raise ProcessingException("More than one email in file.")

        email = emails[0]
        # Iterating the element yields its direct children; this replaces the
        # deprecated ``getchildren()`` call. Children without text are skipped.
        props = {c.tag: stringify(c.text) for c in email if c.text}

        entity.add("subject", props.pop("OPFMessageCopySubject", None))
        entity.add("threadTopic", props.pop("OPFMessageCopyThreadTopic", None))
        entity.add("summary", props.pop("OPFMessageCopyPreview", None))
        # message IDs are already parsed, no need to clean prior:
        entity.add("messageId", props.pop("OPFMessageCopyMessageID", None))
        entity.add("date", self.get_date(props, "OPFMessageCopySentTime"))
        entity.add("modifiedAt", self.get_date(props, "OPFMessageCopyModDate"))

        senders = self.get_contacts(email, "OPFMessageCopySenderAddress")
        self.apply_identities(entity, senders, "emitters", "sender")

        froms = self.get_contacts(  # codespell:ignore
            email, "OPFMessageCopyFromAddresses"
        )
        self.apply_identities(entity, froms, "emitters", "from")  # codespell:ignore

        tos = self.get_contacts(email, "OPFMessageCopyToAddresses")
        self.apply_identities(entity, tos, "recipients", "to")

        ccs = self.get_contacts(email, "OPFMessageCopyCCAddresses")
        self.apply_identities(entity, ccs, "recipients", "cc")

        bccs = self.get_contacts(email, "OPFMessageCopyBCCAddresses")
        self.apply_identities(entity, bccs, "recipients", "bcc")

        entity.add("bodyText", props.pop("OPFMessageCopyBody", None))
        html = props.pop("OPFMessageCopyHTMLBody", None)
        # The HTML body is only used when the flag element holds "1E0".
        has_html = "1E0" == props.pop("OPFMessageGetHasHTML", None)
        if has_html and stringify(html):
            self.extract_html_content(entity, html, extract_metadata=False)

        self.resolve_message_ids(entity)

PDFIngestor

ingestors.documents.pdf.PDFIngestor

PDF file ingestor class.

Extracts the text from the document by converting it first to XML. Splits the file into pages.

File types

application/pdf

File extensions

.pdf

Bases: Ingestor, PDFSupport

PDF file ingestor class.

Extracts the text from the document by converting it first to XML. Splits the file into pages.

Source code in ingestors/documents/pdf.py

class PDFIngestor(Ingestor, PDFSupport):
    """PDF file ingestor class.

    Extracts the text from the document by converting it first to XML.
    Splits the file into pages.
    """

    # Leading bytes of a PDF file, kept as text for backwards compatibility.
    MAGIC = "%PDF-1."
    MIME_TYPES = ["application/pdf"]
    EXTENSIONS = ["pdf"]
    SCORE = 6

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        :raises ProcessingException: with ``ENCRYPTED_MSG`` when
            ``parse_and_ingest`` raises ``UnauthorizedError``, and with a
            generic message for any other exception.
        """
        try:
            self.parse_and_ingest(file_path, entity, self.manager)
        except UnauthorizedError as pwe:
            raise ProcessingException(ENCRYPTED_MSG) from pwe
        except Exception as ex:
            raise ProcessingException("Could not extract PDF file: %r" % ex) from ex

    @classmethod
    def match(cls, file_path, entity):
        """Score this ingestor for ``file_path``.

        When the inherited match gives no positive score, the start of the
        file is compared against ``MAGIC``; a hit returns twice ``SCORE``.
        """
        score = super(PDFIngestor, cls).match(file_path, entity)
        if score <= 0:
            # The file is read in binary mode, so the magic number has to be
            # bytes as well; comparing bytes with str is always False.
            magic = cls.MAGIC
            if not isinstance(magic, bytes):
                magic = magic.encode("ascii")
            with open(file_path, "rb") as fh:
                if fh.read(len(magic)) == magic:
                    return cls.SCORE * 2
        return score

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/pdf.py

def ingest(self, file_path, entity):
    """Run ``parse_and_ingest`` and normalise its failures.

    ``UnauthorizedError`` is re-raised as a ``ProcessingException`` carrying
    ``ENCRYPTED_MSG``; any other exception becomes a ``ProcessingException``
    whose message embeds the ``repr`` of the original error.
    """
    try:
        self.parse_and_ingest(file_path, entity, self.manager)
    except UnauthorizedError as locked:
        raise ProcessingException(ENCRYPTED_MSG) from locked
    except Exception as failure:
        reason = "Could not extract PDF file: %r" % failure
        raise ProcessingException(reason) from failure

PlainTextIngestor

ingestors.documents.plain.PlainTextIngestor

Plain text file ingestor class.

Extracts the text from the document and enforces unicode on it.

File types

text/plain
text/x-c
text/x-c++
text/x-diff
text/x-python
text/x-shellscript
text/x-java
text/x-php
text/troff
text/x-ruby
text/x-pascal
text/x-msdos-batch
text/x-yaml
text/x-makefile
text/x-perl
text/x-objective-c
text/x-msdos-batch
text/x-asm
text/x-csrc
text/x-sh
text/javascript
text/x-algol68

File extensions

.txt
.md
.rst
.nfo

Bases: Ingestor, EncodingSupport

Plain text file ingestor class.

Extracts the text from the document and enforces unicode on it.

Source code in ingestors/documents/plain.py

class PlainTextIngestor(Ingestor, EncodingSupport):
    """Plain text file ingestor class.

    Extracts the text from the document and enforces unicode on it.
    """

    MIME_TYPES = [
        "text/plain",
        "text/x-c",
        "text/x-c++",
        "text/x-diff",
        "text/x-python",
        "text/x-shellscript",
        "text/x-java",
        "text/x-php",
        "text/troff",
        "text/x-ruby",
        "text/x-pascal",
        "text/x-msdos-batch",
        "text/x-yaml",
        "text/x-makefile",
        "text/x-perl",
        "text/x-objective-c",
        "text/x-asm",
        "text/x-csrc",
        "text/x-sh",
        "text/javascript",
        "text/x-algol68",
    ]
    EXTENSIONS = ["txt", "md", "rst", "nfo"]
    # Upper bound on the declared file size, in bytes (4 MiB).
    MAX_SIZE = 4 * 1024 * 1024
    SCORE = 1

    def ingest(self, file_path, entity):
        """Ingestor implementation.

        Sets the schema to ``PlainText`` and stores the decoded file content
        as ``bodyText``.

        :raises ProcessingException: if any ``fileSize`` value on the entity
            exceeds ``MAX_SIZE``.
        """
        entity.schema = model.get("PlainText")
        for file_size in entity.get("fileSize"):
            if int(file_size) > self.MAX_SIZE:
                raise ProcessingException("Text file is too large.")

        text = self.read_file_decoded(entity, file_path)
        entity.set("bodyText", text)

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/plain.py

def ingest(self, file_path, entity):
    """Store the decoded file content on a ``PlainText`` entity.

    Raises ``ProcessingException`` when one of the entity's ``fileSize``
    values is larger than ``MAX_SIZE``; the file is not read in that case.
    """
    entity.schema = model.get("PlainText")
    oversized = any(
        int(declared) > self.MAX_SIZE for declared in entity.get("fileSize")
    )
    if oversized:
        raise ProcessingException("Text file is too large.")

    entity.set("bodyText", self.read_file_decoded(entity, file_path))

OutlookPSTIngestor

ingestors.email.outlookpst.OutlookPSTIngestor

File types

application/vnd.ms-outlook

File extensions

.pst
.ost
.pab

Bases: Ingestor, TempFileSupport, OLESupport, ShellSupport

Source code in ingestors/email/outlookpst.py

class OutlookPSTIngestor(Ingestor, TempFileSupport, OLESupport, ShellSupport):
    """Unpack Outlook mail stores with ``readpst`` and ingest the result.

    The extracted directory is handed to ``DirectoryIngestor`` whether or not
    extraction succeeded, so partially unpacked stores are still processed.
    """

    MIME_TYPES = ["application/vnd.ms-outlook"]
    EXTENSIONS = ["pst", "ost", "pab"]
    # NOTE(review): sibling ingestors define ``SCORE``; confirm that
    # ``BASE_SCORE`` is the attribute the matcher reads for this class.
    BASE_SCORE = 5
    COMMAND_TIMEOUT = 12 * 60 * 60

    def ingest(self, file_path, entity):
        """Extract ``file_path`` into a fresh directory and delegate it."""
        entity.schema = model.get("Package")
        self.extract_ole_metadata(file_path, entity)
        temp_dir = self.make_empty_directory()
        readpst_args = (
            "readpst",
            "-e",  # make subfolders, files per message
            "-D",  # include deleted
            "-8",  # utf-8 where possible
            "-cv",  # export vcards
            "-o",
            temp_dir,
            file_path,
        )
        try:
            self.exec_command(*readpst_args)
            self.manager.delegate(DirectoryIngestor, temp_dir, entity)
        except Exception:
            log.exception("Failed to unpack PST.")
            # Handle partially extracted archives.
            self.manager.delegate(DirectoryIngestor, temp_dir, entity)
            raise

RARIngestor

ingestors.packages.rar.RARIngestor

File types

application/rarapplication/x-rar

File extensions

.rar

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/rar.py

class RARIngestor(PackageSupport, Ingestor):
    """Unpack RAR archives member by member using ``rarfile``."""

    # The comma matters: adjacent string literals are concatenated, which
    # previously produced the single bogus type
    # "application/rarapplication/x-rar".
    MIME_TYPES = ["application/rar", "application/x-rar"]
    EXTENSIONS = ["rar"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract every member of the archive into ``temp_dir``.

        A failure on an individual member is logged and skipped.

        :raises ProcessingException: for partial (non-first) volumes,
            password-protected archives and otherwise invalid RAR files.
        """
        # FIXME: need to figure out how to unpack multi-part files.
        try:
            with rarfile.RarFile(file_path.as_posix()) as rf:
                names = rf.namelist()
                encoding = self.detect_list_encoding(names)
                log.debug("Detected filename encoding: %s", encoding)

                for name in names:
                    try:
                        # Close each member handle once it has been copied.
                        with rf.open(name) as fh:
                            self.extract_member(temp_dir, name, fh, encoding=encoding)
                    except Exception as exc:
                        # TODO: should this be a fatal error?
                        log.warning("Failed to unpack [%s]: %s", name, exc)
        except rarfile.NeedFirstVolume as nfv:
            raise ProcessingException("Cannot load RAR partials") from nfv
        except rarfile.PasswordRequired as pr:
            raise ProcessingException(str(pr)) from pr
        except (rarfile.Error, TypeError) as err:
            raise ProcessingException("Invalid RAR file: %s" % err) from err

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` for files ``rarfile`` recognises, else defer to the parent."""
        # doesn't accept pathlib.Path object
        if rarfile.is_rarfile(file_path.as_posix()):
            return cls.SCORE
        return super(RARIngestor, cls).match(file_path, entity)

SQLiteIngestor

ingestors.tabular.sqlite.SQLiteIngestor

File types

application/x-sqlite3
application/x-sqlite
application/sqlite3
application/sqlite

File extensions

.sqlite3
.sqlite
.db

Bases: Ingestor, TableSupport

Source code in ingestors/tabular/sqlite.py

class SQLiteIngestor(Ingestor, TableSupport):
    """Ingest a SQLite database as a ``Workbook`` with one ``Table`` per table."""

    VALID_TABLE = re.compile(r"[\w\d\_\-]{2,4096}")
    MIME_TYPES = [
        "application/x-sqlite3",
        "application/x-sqlite",
        "application/sqlite3",
        "application/sqlite",
    ]
    EXTENSIONS = ["sqlite3", "sqlite", "db"]
    SCORE = 8

    def get_tables(self, conn):
        """Yield the names of tables listed in ``sqlite_master``.

        Only names whose beginning matches ``VALID_TABLE`` are yielded; the
        pattern is not anchored at the end of the name.
        """
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table';")
        for (name,) in c.fetchall():
            if self.VALID_TABLE.match(name):
                yield name

    def generate_rows(self, conn, table):
        """Yield each row of ``table`` as an ``OrderedDict`` keyed by column name.

        :raises ProcessingException: if the table cannot be queried.
        """
        cur = conn.cursor()
        # Table names cannot be bound as parameters, so the name is quoted as
        # an identifier instead (embedded double quotes are doubled). This
        # keeps names with hyphens or other punctuation from breaking, or
        # changing the meaning of, the statement.
        # see https://stackoverflow.com/questions/39196462
        quoted = '"%s"' % table.replace('"', '""')
        try:
            cur.execute("SELECT * FROM %s;" % quoted)
        except sqlite3.OperationalError as oe:
            log.warning("SQLite error: %s", oe)
            raise ProcessingException("Cannot query table: %s" % table) from oe

        headers = [i[0] for i in cur.description]
        while True:
            try:
                row = cur.fetchone()
                if row is None:
                    return
                yield OrderedDict(zip(headers, row))
            except sqlite3.OperationalError as oe:
                # Best effort: log the failed fetch and try the next row.
                log.warning("SQLite error: %s", oe)

    def ingest(self, file_path, entity):
        """Emit a ``Table`` entity, with its rows, for every table in the file."""
        entity.schema = model.get("Workbook")
        conn = sqlite3.connect(file_path)
        try:
            for table_name in self.get_tables(conn):
                table = self.manager.make_entity("Table", parent=entity)
                table.make_id(entity, table_name)
                table.set("title", table_name)
                # Emit a partial table fragment with parent reference and name
                # early, so that we don't have orphan fragments in case of an error
                # in the middle of processing.
                # See https://github.com/alephdata/ingest-file/issues/171
                self.manager.emit_entity(table, fragment="initial")
                rows = self.generate_rows(conn, table_name)
                self.emit_row_dicts(table, rows)
                self.manager.emit_entity(table)
        finally:
            conn.close()

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited score only if the file opens as a SQLite database.

        Returns ``-1`` when the inherited score is not positive or when
        reading ``sqlite_master`` fails.
        """
        score = super(SQLiteIngestor, cls).match(file_path, entity)
        if score > 0:
            try:
                conn = sqlite3.connect(file_path)
                try:
                    conn.execute("SELECT * FROM sqlite_master;").fetchall()
                finally:
                    # Do not leave the probe connection open.
                    conn.close()
                return score
            except Exception:
                # Any failure simply means "not a usable SQLite file".
                pass
        return -1

SVGIngestor

ingestors.media.svg.SVGIngestor

File types

image/svg+xml

File extensions

.svg

Bases: Ingestor, EncodingSupport, HTMLSupport

Source code in ingestors/media/svg.py

class SVGIngestor(Ingestor, EncodingSupport, HTMLSupport):
    """Ingest SVG files as ``HyperText`` using the HTML content extractor."""

    MIME_TYPES = ["image/svg+xml"]
    EXTENSIONS = ["svg"]
    SCORE = 20

    def ingest(self, file_path, entity):
        """Decode the file, extract its content and store it as ``bodyText``."""
        entity.schema = model.get("HyperText")
        markup = self.read_file_decoded(entity, file_path)
        extracted = self.extract_html_content(entity, markup)
        entity.add("bodyText", extracted)

TarIngestor

ingestors.packages.tar.TarIngestor

File types

application/tar
application/x-tar
application/x-tgz
application/x-gtar

File extensions

.tar

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/tar.py

class TarIngestor(PackageSupport, Ingestor):
    """Unpack tar archives member by member using ``tarfile``."""

    MIME_TYPES = [
        "application/tar",
        "application/x-tar",
        "application/x-tgz",
        "application/x-gtar",
    ]
    EXTENSIONS = ["tar"]
    SCORE = 4

    def unpack(self, file_path, entity, temp_dir):
        """Extract every member of the archive into ``temp_dir``.

        A failure on an individual member is logged and skipped.

        :raises ProcessingException: if the archive itself cannot be read.
        """
        try:
            with tarfile.open(name=file_path, mode="r:*") as tf:
                names = tf.getnames()
                encoding = self.detect_list_encoding(names, default=tf.encoding)
                log.debug("Detected filename encoding: %s", encoding)

                for name in names:
                    fh = None
                    try:
                        fh = tf.extractfile(name)
                        self.extract_member(temp_dir, name, fh, encoding=encoding)
                    except Exception:
                        # TODO: should this be a fatal error?
                        log.exception("Failed to unpack: %r", name)
                    finally:
                        # extractfile() returns None for members without
                        # file data; real handles are closed after use.
                        if fh is not None:
                            fh.close()
        except (tarfile.TarError, IOError, EOFError) as err:
            raise ProcessingException("Invalid Tar file: %s" % err) from err

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` for files ``tarfile`` recognises, else defer to the parent."""
        if tarfile.is_tarfile(file_path):
            return cls.SCORE
        return super(TarIngestor, cls).match(file_path, entity)

TIFFIngestor

ingestors.media.tiff.TIFFIngestor

TIFF appears to not really be an image format. Who knew?

File types

image/tiff
image/x-tiff

File extensions

.tif
.tiff

Bases: Ingestor, PDFSupport, TempFileSupport, ShellSupport

TIFF appears to not really be an image format. Who knew?

Source code in ingestors/media/tiff.py

class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
    """TIFF appears to not really be an image format. Who knew?"""

    MIME_TYPES = [
        "image/tiff",
        "image/x-tiff",
    ]
    EXTENSIONS = ["tif", "tiff"]
    SCORE = 11

    def ingest(self, file_path, entity):
        """Convert the TIFF to PDF with ``tiff2pdf`` and extract that PDF.

        The first attempt passes ``-n`` and ``-j``; if it raises
        ``ProcessingException`` the command is retried without those flags.
        """
        entity.schema = model.get("Pages")
        pdf_path = self.make_work_file("tiff.pdf")
        # Arguments shared by both invocations, in command-line order.
        resolution = ("-x", "300", "-y", "300")
        destination = ("-o", pdf_path)
        try:
            self.exec_command(
                "tiff2pdf", file_path, "-n", "-j", *resolution, *destination
            )
        except ProcessingException:
            self.exec_command("tiff2pdf", file_path, *resolution, *destination)

        self.assert_outfile(pdf_path)

        self.pdf_alternative_extract(entity, pdf_path, self.manager)

VCardIngestor

ingestors.email.vcard.VCardIngestor

File types

text/vcard
text/x-vcard

File extensions

.vcf
.vcard

Bases: Ingestor, EncodingSupport

Source code in ingestors/email/vcard.py

class VCardIngestor(Ingestor, EncodingSupport):
    """Ingest vCard files as ``PlainText`` and emit a ``Person`` per card."""

    MIME_TYPES = ["text/vcard", "text/x-vcard"]
    EXTENSIONS = ["vcf", "vcard"]
    SCORE = 10

    def get_field(self, card: Component, field: str):
        """Return the values stored under ``field`` on ``card`` as strings."""
        entries: list[ContentLine] = ensure_list(card.contents.get(field))
        return [str(entry.value) for entry in entries]

    def ingest_card(self, entity, card):
        """Build a ``Person`` from ``card`` and emit it if it received an ID.

        The ID is derived from each non-empty, lower-cased e-mail address in
        turn, so a card without any e-mail address is not emitted.
        """
        person = self.manager.make_entity("Person")
        person.add("proof", entity.id)
        # (entity property, vCard field) pairs, applied in this order.
        simple_fields = (
            ("name", "n"),
            ("name", "fn"),
            ("gender", "gender"),
            ("birthDate", "bday"),
            ("position", "title"),
            ("summary", "note"),
            ("keywords", "categories"),
            ("phone", "tel"),
            ("weakAlias", "nickname"),
        )
        for prop, field in simple_fields:
            person.add(prop, self.get_field(card, field))

        for email in self.get_field(card, "email"):
            key = email.strip().lower()
            if key:
                person.make_id(key)
            person.add("email", email)

        if person.id:
            self.manager.apply_context(person, entity)
            self.manager.emit_entity(person)

    def ingest(self, file_path, entity):
        """Store the sanitized file text and ingest each card found in it.

        :raises ProcessingException: if the vCard data cannot be parsed.
        """
        entity.schema = model.get("PlainText")
        text = sanitize_text(self.read_file_decoded(entity, file_path))
        entity.set("bodyText", text)
        try:
            for card in vobject.readComponents(text, allowQP=True):
                self.ingest_card(entity, card)
        except (ParseError, UnicodeDecodeError) as err:
            raise ProcessingException("Cannot parse vcard: %s" % err) from err

VideoIngestor

ingestors.media.video.VideoIngestor

File types

application/x-shockwave-flash
video/quicktime
video/mp4
video/x-flv

File extensions

.avi
.mpg
.mpeg
.mkv
.mp4
.mov

Bases: Ingestor, TimestampSupport, TranscriptionSupport

Source code in ingestors/media/video.py

class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
    """Ingest video files: copy MediaInfo metadata and queue transcription."""

    MIME_TYPES = [
        "application/x-shockwave-flash",
        "video/quicktime",
        "video/mp4",
        "video/x-flv",
    ]
    EXTENSIONS = [
        "avi",
        "mpg",
        "mpeg",
        "mkv",
        "mp4",
        "mov",
    ]
    SCORE = 3

    def ingest(self, file_path, entity):
        """Populate ``entity`` from the file's MediaInfo tracks.

        A transcription failure is only logged; a metadata failure is fatal.

        :raises ProcessingException: if the metadata cannot be read.
        """
        try:
            entity.schema = model.get("Video")
            log.info("[%r] flagged as video.", entity)
            metadata = MediaInfo.parse(file_path)
            for track in metadata.tracks:
                entity.add("title", track.title)
                entity.add("generator", track.writing_application)
                entity.add("generator", track.writing_library)
                entity.add("generator", track.publisher)
                entity.add("authoredAt", self.parse_timestamp(track.recorded_date))
                entity.add("authoredAt", self.parse_timestamp(track.tagged_date))
                entity.add("authoredAt", self.parse_timestamp(track.encoded_date))
                modified_at = self.parse_timestamp(track.file_last_modification_date)
                entity.add("modifiedAt", modified_at)
                entity.add("duration", track.duration)
        except Exception as ex:
            # Format the message here: exceptions do not interpolate extra
            # positional arguments the way logging calls do.
            raise ProcessingException("Could not read video: %r" % ex) from ex
        try:
            self.transcribe(self.manager.dataset, entity, self.manager.context)
        except Exception as ex:
            log.error(f"Could not queue audio for transcription: {ex}")

    @classmethod
    def match(cls, file_path, entity):
        """Score the file; any ``video/*`` MIME type is accepted as a fallback.

        When the inherited match is not positive and one of the entity's
        ``mimeType`` values starts with ``video/``, twice ``SCORE`` is returned.
        """
        score = super(VideoIngestor, cls).match(file_path, entity)
        if score <= 0:
            for mime_type in entity.get("mimeType"):
                if mime_type.startswith("video/"):
                    return cls.SCORE * 2
        return score

ExcelIngestor

ingestors.tabular.xls.ExcelIngestor

File types

application/excel
application/x-excel
application/vnd.ms-excel
application/x-msexcel

File extensions

.xls
.xlt
.xla

Bases: Ingestor, TableSupport, OLESupport

Source code in ingestors/tabular/xls.py

class ExcelIngestor(Ingestor, TableSupport, OLESupport):
    """Ingest legacy Excel workbooks with ``xlrd``, one ``Table`` per sheet."""

    MIME_TYPES = [
        "application/excel",
        "application/x-excel",
        "application/vnd.ms-excel",
        "application/x-msexcel",
    ]
    EXTENSIONS = ["xls", "xlt", "xla"]
    SCORE = 7

    def convert_cell(self, cell, sheet):
        """Return the cell value, converting date cells.

        Date cells become ``None`` when the raw value is 0, an ISO time string
        when the date part is empty, and a ``datetime`` otherwise. If the
        conversion fails the raw value is returned and a warning is logged.
        """
        value = cell.value
        try:
            if cell.ctype == xlrd.XL_CELL_DATE:
                if value == 0:
                    return None
                year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
                    value, sheet.book.datemode
                )
                if (year, month, day) == (0, 0, 0):
                    # Time-only value: there is no date part to keep.
                    return time(hour, minute, second).isoformat()
                return datetime(year, month, day, hour, minute, second)
        except Exception as exc:
            log.warning("Error in Excel value [%s]: %s", cell, exc)
        return value

    def generate_csv(self, sheet):
        """Yield each row of ``sheet`` as a list of converted cell values."""
        for row_index in range(0, sheet.nrows):
            yield [self.convert_cell(c, sheet) for c in sheet.row(row_index)]

    def ingest(self, file_path, entity):
        """Emit a ``Table`` entity for every sheet in the workbook.

        :raises ProcessingException: if the workbook cannot be opened or read.
        """
        entity.schema = model.get("Workbook")
        self.extract_ole_metadata(file_path, entity)
        try:
            book = xlrd.open_workbook(file_path, formatting_info=False)
        except XLRDError as err:
            # NOTE(review): every XLRDError on open is reported as an
            # encrypted file; confirm that is the intended mapping.
            # Chained like the other branches so the cause is not lost.
            raise ProcessingException(ENCRYPTED_MSG) from err
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for sheet in book.sheets():
                table = self.manager.make_entity("Table", parent=entity)
                table.make_id(entity.id, sheet.name)
                table.set("title", sheet.name)
                # Emit a partial table fragment with parent reference and name
                # early, so that we don't have orphan fragments in case of an error
                # in the middle of processing.
                # See https://github.com/alephdata/ingest-file/issues/171
                self.manager.emit_entity(table, fragment="initial")
                self.emit_row_tuples(table, self.generate_csv(sheet))
                if table.has("csvHash"):
                    self.manager.emit_entity(table)
        except XLRDError as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err
        finally:
            book.release_resources()

ExcelXMLIngestor

ingestors.tabular.xlsx.ExcelXMLIngestor

File types

application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
application/vnd.openxmlformats-officedocument.spreadsheetml.template
application/vnd.ms-excel.sheet.macroenabled.12
application/vnd.ms-excel.sheet.binary.macroenabled.12
application/vnd.ms-excel.template.macroenabled.12
application/vnd.ms-excel.sheet.macroEnabled.main+xml

File extensions

.xlsx
.xlsm
.xltx
.xltm

Bases: Ingestor, TableSupport, OOXMLSupport

Source code in ingestors/tabular/xlsx.py

class ExcelXMLIngestor(Ingestor, TableSupport, OOXMLSupport):
    """Ingest OOXML Excel workbooks, emitting one ``Table`` per worksheet."""

    MIME_TYPES = [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # noqa
        "application/vnd.openxmlformats-officedocument.spreadsheetml.template",  # noqa
        "application/vnd.ms-excel.sheet.macroenabled.12",
        "application/vnd.ms-excel.sheet.binary.macroenabled.12",
        "application/vnd.ms-excel.template.macroenabled.12",
        "application/vnd.ms-excel.sheet.macroEnabled.main+xml",
    ]
    EXTENSIONS = ["xlsx", "xlsm", "xltx", "xltm"]
    SCORE = 7

    def generate_rows(self, sheet):
        """Yield the cell values of each row; unreadable rows are logged and skipped."""
        for cells in sheet.rows:
            try:
                yield [cell.value for cell in cells]
            except (ValueError, OverflowError, ParseError) as ve:
                log.warning("Failed to read Excel row: %s", ve)

    def ingest(self, file_path, entity):
        """Emit a ``Table`` entity for every sheet that exposes ``rows``.

        :raises ProcessingException: if the workbook cannot be loaded or read.
        """
        entity.schema = model.get("Workbook")
        self.ooxml_extract_metadata(file_path, entity)
        try:
            book = load_workbook(file_path, read_only=True)
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for name in book.sheetnames:
                sheet = book[name]
                if hasattr(sheet, "rows"):
                    table = self.manager.make_entity("Table", parent=entity)
                    table.make_id(entity.id, name)
                    table.set("title", name)
                    # Emit a partial table fragment with parent reference and name
                    # early, so that we don't have orphan fragments in case of an error
                    # in the middle of processing.
                    # See https://github.com/alephdata/ingest-file/issues/171
                    self.manager.emit_entity(table, fragment="initial")
                    log.debug("Sheet: %s", name)
                    self.emit_row_tuples(table, self.generate_rows(sheet))
                    if table.has("csvHash"):
                        self.manager.emit_entity(table)
                else:
                    # Sheets without rows are reported and skipped.
                    log.warning("Cannot parse chart sheet: %s", name)
        except Exception as err:
            raise ProcessingException("Cannot read Excel file: %s" % err) from err
        finally:
            book.close()

    @classmethod
    def match(cls, file_path, entity):
        """Return the inherited score, or ``-1`` if the OOXML manifest check fails."""
        score = super(ExcelXMLIngestor, cls).match(file_path, entity)
        if score <= 0:
            return score
        if cls.inspect_ooxml_manifest(file_path):
            return score
        return -1

XMLIngestor

ingestors.documents.xml.XMLIngestor

XML file ingestor class. Generates a tabular HTML representation.

File types

text/xml

File extensions

.xml

Bases: Ingestor, EncodingSupport, XMLSupport, HTMLSupport

XML file ingestor class. Generates a tabular HTML representation.

Source code in ingestors/documents/xml.py

class XMLIngestor(Ingestor, EncodingSupport, XMLSupport, HTMLSupport):
    """XML file ingestor class. Generates a tabular HTML representation."""

    MIME_TYPES = ["text/xml"]
    EXTENSIONS = ["xml"]
    SCORE = 1
    MAX_SIZE = 4 * 1024 * 1024
    # Stylesheet that renders every element as a table row of name and value,
    # nesting a table for elements that have child elements.
    XSLT = etree.XML(
        b"""<?xml version="1.0" encoding="UTF-8"?>
        <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
            version="1.0">
        <xsl:output omit-xml-declaration="yes" indent="yes"/>
        <xsl:strip-space elements="*"/>

        <xsl:template match="/">
            <table>
            <xsl:apply-templates/>
            </table>
        </xsl:template>

        <xsl:template match="*">
            <tr>
            <td>
                <p><xsl:value-of select="name()"/></p>
            </td>
            <td>
                <p><xsl:value-of select="."/></p>
            </td>
            </tr>
        </xsl:template>

        <xsl:template match="*[*]">
            <tr>
            <td>
                <p><xsl:value-of select="name()"/></p>
            </td>
            <td>
                <table>
                <xsl:apply-templates/>
                </table>
            </td>
            </tr>
        </xsl:template>

        </xsl:stylesheet>"""
    )

    def ingest(self, file_path, entity):
        """Store the XML's text and an HTML table rendering on the entity.

        Raises ``ProcessingException`` when a ``fileSize`` value exceeds
        ``MAX_SIZE`` or when the XSLT conversion raises ``ValueError``.
        """
        entity.schema = model.get("HyperText")
        too_big = any(
            int(declared) > self.MAX_SIZE for declared in entity.get("fileSize")
        )
        if too_big:
            raise ProcessingException("XML file is too large.")

        doc = self.parse_xml_path(file_path)
        entity.set("bodyText", self.extract_html_text(doc.getroot()))
        try:
            to_table = etree.XSLT(self.XSLT)
            rendered = to_table(doc)
            markup = html.tostring(rendered, encoding=str, pretty_print=True)
            entity.set("bodyHtml", markup)
        except ValueError as ve:
            raise ProcessingException("Error converting XML file: %s" % ve) from ve

`ingest(file_path, entity)`

Ingestor implementation.

Source code in ingestors/documents/xml.py

def ingest(self, file_path, entity):
    """Store the XML's text and an HTML table rendering on the entity.

    Raises ``ProcessingException`` when a ``fileSize`` value exceeds
    ``MAX_SIZE`` or when the XSLT conversion raises ``ValueError``.
    """
    entity.schema = model.get("HyperText")
    too_big = any(
        int(declared) > self.MAX_SIZE for declared in entity.get("fileSize")
    )
    if too_big:
        raise ProcessingException("XML file is too large.")

    doc = self.parse_xml_path(file_path)
    entity.set("bodyText", self.extract_html_text(doc.getroot()))
    try:
        to_table = etree.XSLT(self.XSLT)
        rendered = to_table(doc)
        markup = html.tostring(rendered, encoding=str, pretty_print=True)
        entity.set("bodyHtml", markup)
    except ValueError as ve:
        raise ProcessingException("Error converting XML file: %s" % ve) from ve

ZipIngestor

ingestors.packages.zip.ZipIngestor

File types

application/zip
application/x-zip
multipart/x-zip
application/zip-compressed
application/x-zip-compressed

File extensions

.zip

Bases: PackageSupport, Ingestor

Source code in ingestors/packages/zip.py

class ZipIngestor(PackageSupport, Ingestor):
    """Unpack ZIP archives member by member using ``zipfile``."""

    MIME_TYPES = [
        "application/zip",
        "application/x-zip",
        "multipart/x-zip",
        "application/zip-compressed",
        "application/x-zip-compressed",
    ]
    EXTENSIONS = ["zip"]
    SCORE = 3

    def unpack(self, file_path, entity, temp_dir):
        """Extract every non-directory member of the archive into ``temp_dir``.

        A failure on an individual member is logged at debug level and
        skipped. Raises ``ProcessingException`` if the archive cannot be read.
        """
        try:
            with zipfile.ZipFile(file_path) as archive:
                members = archive.namelist()
                encoding = self.detect_list_encoding(members)
                log.debug("Detected filename encoding: %s", encoding)
                for name in members:
                    try:
                        # Directory entries carry no data to extract.
                        if archive.getinfo(name).is_dir():
                            continue

                        with archive.open(name) as member:
                            self.extract_member(
                                temp_dir, name, member, encoding=encoding
                            )
                    except Exception as ex:
                        # TODO: should this be a fatal error?
                        log.debug("Failed to unpack [%r]: %s", name, ex)
        except (zipfile.BadZipfile, UnicodeDecodeError, OSError) as bzfe:
            raise ProcessingException("Invalid ZIP file: %s" % bzfe) from bzfe

    @classmethod
    def match(cls, file_path, entity):
        """Return ``SCORE`` for files ``zipfile`` recognises, else defer to the parent."""
        if not zipfile.is_zipfile(file_path):
            return super(ZipIngestor, cls).match(file_path, entity)
        return cls.SCORE