Module refinery.lib.mime

File type related functions.

Expand source code Browse git
"""
File type related functions.
"""
from __future__ import annotations

import functools

from refinery.lib.id import get_pe_type, get_structured_data_type
from refinery.lib.magic import magic, magicparse

# Lookup table from MIME type to a conventional file extension (without the leading dot).
# All keys are lowercase and carry no parameters such as "; charset=...", so callers have
# to normalize a MIME string accordingly before looking it up.
FileTypeMap = {
    'application/x-setupscript': 'ini',
    # Misspelled key, kept only for backward compatibility; the correct spelling follows.
    'applicaiton/x-bytecode.python': 'pyc',
    'application/x-bytecode.python': 'pyc',
    'application/acad': 'dwg',
    'application/arj': 'arj',
    'application/book': 'book',
    'application/dos-exe': 'exe',
    'application/drafting': 'drw',
    'application/dxf': 'dxf',
    'application/ecmascript': 'js',
    'application/epub+zip': 'epub',
    'application/excel': 'xls',
    'application/exe': 'exe',
    'application/gnutar': 'tgz',
    'application/gzip': 'gz',
    'application/hlp': 'hlp',
    'application/inf': 'inf',
    'application/java-archive': 'jar',
    'application/java-byte-code': 'class',
    'application/java': 'class',
    'application/javascript': 'js',
    'application/json': 'json',
    'application/ld+json': 'jsonld',
    'application/lha': 'lha',
    'application/lzx': 'lzx',
    'application/mac-binary': 'bin',
    'application/mac-compactpro': 'cpt',
    'application/macbinary': 'bin',
    'application/mime': 'aps',
    'application/msdos-windows': 'exe',
    'application/mspowerpoint': 'ppt',
    'application/msword': 'doc',
    'application/ogg': 'ogg',
    'application/pdf': 'pdf',
    'application/plain': 'txt',
    'application/postscript': 'ps',
    'application/powerpoint': 'ppt',
    'application/rtf': 'rtf',
    'application/vnd.amazon.ebook': 'azw',
    'application/vnd.apple.installer+xml': 'mpkg',
    'application/vnd.hp-pcl': 'pcl',
    'application/vnd.lotus-1-2-3': '123',
    'application/vnd.microsoft.portable-executable': 'exe',
    'application/vnd.mozilla.xul+xml': 'xul',
    'application/vnd.ms-cab-compressed': 'cab',
    'application/vnd.ms-excel': 'xls',
    'application/vnd.ms-fontobject': 'eot',
    'application/vnd.ms-outlook': 'eml',
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.oasis.opendocument.presentation': 'odp',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.rar': 'rar',
    'application/vnd.rn-realmedia': 'rm',
    'application/vnd.visio': 'vsd',
    'application/vocaltec-media-desc': 'vmd',
    'application/x-7z-compressed': '7z',
    'application/x-abiword': 'abw',
    'application/x-binary': 'bin',
    'application/x-bsh': 'sh',
    'application/x-bzip': 'bz',
    'application/x-bzip2': 'bz2',
    'application/x-cdlink': 'vcd',
    'application/x-compactpro': 'cpt',
    'application/x-compress': 'z',
    'application/x-compressed': 'gz',
    'application/x-cpt': 'cpt',
    'application/x-csh': 'csh',
    'application/x-dosexec': 'exe',
    'application/x-dvi': 'dvi',
    'application/x-elf-executable': 'elf',
    'application/x-excel': 'xls',
    'application/x-exe': 'exe',
    'application/x-executable': 'elf',
    'application/x-freearc': 'arc',
    'application/x-gzip': 'gz',
    'application/x-helpfile': 'hlp',
    'application/x-httpd-php': 'php',
    'application/x-java-class': 'class',
    'application/x-java-commerce': 'jcm',
    'application/x-javascript': 'js',
    'application/x-latex': 'latex',
    'application/x-lha': 'lha',
    'application/x-lisp': 'lsp',
    'application/x-lzh': 'lzh',
    'application/x-lzx': 'lzx',
    'application/x-macbinary': 'bin',
    'application/x-midi': 'mid',
    'application/x-mplayer2': 'asx',
    'application/x-msdos-program': 'exe',
    'application/x-msdownload': 'exe',
    'application/x-msexcel': 'xls',
    'application/x-msi': 'msi',
    'application/x-mspowerpoint': 'ppt',
    'application/x-navi-animation': 'ani',
    'application/x-pcl': 'pcl',
    'application/x-pointplus': 'css',
    'application/x-rtf': 'rtf',
    'application/x-sh': 'sh',
    'application/x-shar': 'sh',
    'application/x-shockwave-flash': 'swf',
    'application/x-tar': 'tar',
    'application/x-tcl': 'tcl',
    'application/x-troff-man': 'man',
    'application/x-troff-msvideo': 'avi',
    'application/x-visio': 'vsd',
    'application/x-vrml': 'vrml',
    'application/x-winexe': 'exe',
    'application/x-winhelp': 'hlp',
    'application/x-zip-compressed': 'zip',
    'application/x-zoo': 'zoo',
    'application/xhtml+xml': 'xhtml',
    'application/xml': 'xml',
    'application/zip': 'zip',
    'audio/aac': 'aac',
    'audio/aiff': 'aiff',
    'audio/basic': 'au',
    'audio/midi': 'mid',
    'audio/mod': 'mod',
    'audio/mpeg': 'mpg',
    'audio/mpeg3': 'mp3',
    'audio/ogg': 'ogg',
    'audio/opus': 'opus',
    'audio/wav': 'wav',
    'audio/webm': 'webm',
    'audio/x-aiff': 'aiff',
    'audio/x-au': 'au',
    'audio/x-jam': 'jam',
    'audio/x-mid': 'mid',
    'audio/x-midi': 'mid',
    'audio/x-mod': 'mod',
    'audio/x-mpeg-3': 'mp3',
    'audio/x-mpeg': 'mp2',
    # Misspelled key, kept only for backward compatibility; the correct spelling follows.
    'audio/x-mpequrl': 'm3u',
    'audio/x-mpegurl': 'm3u',
    'audio/x-pn-realaudio-plugin': 'ra',
    'audio/x-pn-realaudio': 'rm',
    'audio/x-realaudio': 'ra',
    'audio/x-wav': 'wav',
    'audio/xm': 'xm',
    'font/otf': 'otf',
    'font/ttf': 'ttf',
    'font/woff': 'woff',
    'font/woff2': 'woff2',
    'image/bmp': 'bmp',
    'image/gif': 'gif',
    'image/jpeg': 'jpg',
    'image/pict': 'pic',
    'image/pjpeg': 'jpg',
    'image/png': 'png',
    'image/svg+xml': 'svg',
    'image/tiff': 'tif',
    'image/vnd.dwg': 'dwg',
    'image/vnd.microsoft.icon': 'ico',
    'image/webp': 'webp',
    'image/x-3ds': '3ds',
    'image/x-dwg': 'dwg',
    'image/x-icon': 'ico',
    'image/x-jg': 'art',
    'image/x-jps': 'jps',
    'image/x-pcx': 'pcx',
    'image/x-pict': 'pct',
    'image/x-quicktime': 'qtif',
    'image/x-tiff': 'tif',
    'image/x-windows-bmp': 'bmp',
    'image/x-ms-bmp': 'bmp',
    'image/x-xpixmap': 'pm',
    'model/vrml': 'vrml',
    'multipart/x-gzip': 'gz',
    'multipart/x-zip': 'zip',
    'music/crescendo': 'mid',
    'text/asp': 'asp',
    'text/calendar': 'ics',
    'text/css': 'css',
    'text/csv': 'csv',
    'text/ecmascript': 'js',
    'text/html': 'html',
    'text/javascript': 'js',
    'text/pascal': 'pas',
    'text/plain': 'txt',
    'text/richtext': 'rtf',
    'text/sgml': 'sgml',
    'text/uri-list': 'uri',
    'text/webviewhtml': 'htt',
    'text/x-asm': 'asm',
    'text/x-c': 'c',
    'text/x-component': 'htc',
    'text/x-h': 'h',
    'text/x-java-source': 'java',
    'text/x-script.lisp': 'lsp',
    'text/x-script.perl-module': 'pm',
    'text/x-script.perl': 'pl',
    # Misspelled key, kept only for backward compatibility; the correct spelling follows.
    'text/x-script.phyton': 'py',
    'text/x-script.python': 'py',
    'text/x-script.sh': 'sh',
    'text/x-script.tcl': 'tcl',
    # Key lacks the dot separator; kept for backward compatibility next to the dotted form.
    'text/x-scriptzsh': 'zsh',
    'text/x-script.zsh': 'zsh',
    'text/x-server-parsed-html': 'shtml',
    'text/x-sgml': 'sgml',
    'text/xml': 'xml',
    'video/3gpp': '3gp',
    'video/3gpp2': '3g2',
    'video/avi': 'avi',
    'video/dl': 'dl',
    'video/mp2t': 'ts',
    'video/mp4': 'mp4',
    'video/mpeg': 'mpeg',
    'video/msvideo': 'avi',
    'video/ogg': 'ogg',
    'video/quicktime': 'mov',
    'video/vnd.rn-realvideo': 'rv',
    'video/webm': 'webm',
    'video/x-dl': 'dl',
    'video/x-dv': 'dif',
    'video/x-mpeg': 'mp4',
    'video/x-mpeq2a': 'mp2',
    'video/x-ms-asf-plugin': 'asx',
    'video/x-ms-asf': 'asf',
    'video/x-msvideo': 'avi',
    'video/x-sgi-movie': 'movie',
    'vms/exe': 'exe',
    'windows/metafile': 'wmf',
    'x-conference/x-cooltalk': 'ice',
    'x-music/x-midi': 'mid',
    'x-world/x-3dmf': '3dmf',
    'x-world/x-vrml': 'vrml',
}


def file_extension(mime, default='bin'):
    """
    Return the file extension (without leading dot) registered in `FileTypeMap` for the given
    MIME type, or `default` if the type is unknown.

    String inputs are normalized before the lookup in the same way `FileMagicInfo` does it:
    anything after the first semicolon (parameters such as a charset) is dropped and the
    remainder is lowercased; surrounding whitespace is stripped as well. Inputs that are not
    strings are looked up unchanged, which yields `default` for values such as `None`.
    """
    if isinstance(mime, str):
        # The keys of FileTypeMap are lowercase and parameter-free.
        mime = mime.split(';', 1)[0].strip().lower()
    return FileTypeMap.get(mime, default)


class FileMagicInfo:
    """
    Collects file type information about a buffer of binary data.

    After construction, the following attributes are set:

    - `extension`: the guessed file extension without leading dot, or the `default` argument
      when nothing better is known. For gzip and bzip2 data whose beginning could be
      decompressed, it is a compound of the form `inner.gz` or `inner.bz2`.
    - `description`: the textual description returned by `magicparse`, the details reported by
      `get_structured_data_type`, or the string `data` if neither produced a result.
    - `mime`: the MIME type, lowercased and without parameters; `application/octet-stream`
      unless a more specific type was determined.
    - `blob`: `True` if the data remained unidentified, i.e. the description is `data` and
      `get_structured_data_type` did not recognize the buffer either.
    """
    extension: str
    description: str
    mime: str
    blob: bool

    # Stop decompressing once this many bytes of decompressed data are available.
    _GZIP_PEEK_MAXIMUM = 1024
    # The decompressed prefix is only inspected when it is longer than this many bytes.
    _GZIP_PEEK_MINIMUM = 64
    # Number of compressed bytes fed to the decompressor per step.
    _GZIP_DC_CHUNK_LEN = 16

    def __init__(self, data, default: str = 'bin', decompress: bool = True) -> None:
        """
        Inspect `data` and populate the attributes of this object. The `default` value is used
        as the extension when no better one is found. When `decompress` is set, the beginning
        of gzip and bzip2 data is decompressed in order to also determine the extension of the
        compressed content.
        """
        extension = default
        description = 'data'
        mime = 'application/octet-stream'
        blob = True

        if magic is not None:
            if not isinstance(data, bytes):
                data = bytes(data)
            mime = magicparse(data, mime=True)
            # Drop parameters like "; charset=..." and normalize case for the table lookup.
            mime = mime.split(';')[0].lower()
            extension = FileTypeMap.get(mime, default)
            description = magicparse(data).strip()
            if description == 'Microsoft OOXML':
                extension = 'docx'
            elif extension == 'exe' and (t := get_pe_type(data)):
                extension = t.extension
            blob = description.lower() == 'data'

        # Second opinion for data that is still unidentified at this point.
        if blob and (check := get_structured_data_type(data)):
            extension = check.extension
            description = check.details
            mime = check.mime
            blob = False

        if extension in ('gz', 'gzip', 'bz2') and decompress:
            if extension == 'bz2':
                import bz2
                dc = bz2.BZ2Decompressor()
            else:
                import zlib
                dc = zlib.decompressobj(0x10)
            mv = memoryview(data)
            cursor = 0
            buffer = bytearray()
            while cursor < len(mv):
                end = cursor + self._GZIP_DC_CHUNK_LEN
                try:
                    buffer.extend(dc.decompress(mv[cursor:end]))
                except Exception:
                    # Best effort: corrupt or truncated streams still yield what was decoded.
                    break
                else:
                    cursor = end
                # A zlib decompressor does not raise when fed input after the end of the
                # stream, it only collects it as unused data. Without the eof check, the loop
                # would walk through all trailing data without ever producing more output.
                if dc.eof or len(buffer) >= self._GZIP_PEEK_MAXIMUM:
                    break
            if len(buffer) > self._GZIP_PEEK_MINIMUM:
                try:
                    inner = FileMagicInfo(buffer, default).extension
                except Exception:
                    # Best effort: keep the plain extension if the inner data defies analysis.
                    pass
                else:
                    extension = F'{inner}.{extension}'

        self.extension = extension
        self.description = description
        self.mime = mime
        self.blob = blob


# The cache is bounded: the cache keys are the inspected buffers themselves, and an unbounded
# cache would keep every buffer that was ever passed to this function alive.
@functools.lru_cache(maxsize=128)
def get_cached_file_magic_info(data):
    """
    Return a `FileMagicInfo` for the given data with default options, reusing the result of a
    previous call with equal input when it is still cached. The input has to be hashable since
    it serves as the cache key.
    """
    return FileMagicInfo(data)

Functions

def file_extension(mime, default='bin')
Expand source code Browse git
def file_extension(mime, default='bin'):
    """
    Return the file extension (without leading dot) registered in `FileTypeMap` for the given
    MIME type, or `default` if the type is unknown.

    String inputs are normalized before the lookup in the same way `FileMagicInfo` does it:
    anything after the first semicolon (parameters such as a charset) is dropped and the
    remainder is lowercased; surrounding whitespace is stripped as well. Inputs that are not
    strings are looked up unchanged, which yields `default` for values such as `None`.
    """
    if isinstance(mime, str):
        # The keys of FileTypeMap are lowercase and parameter-free.
        mime = mime.split(';', 1)[0].strip().lower()
    return FileTypeMap.get(mime, default)
def get_cached_file_magic_info(data)
Expand source code Browse git
# The cache is bounded: the cache keys are the inspected buffers themselves, and an unbounded
# cache would keep every buffer that was ever passed to this function alive.
@functools.lru_cache(maxsize=128)
def get_cached_file_magic_info(data):
    """
    Return a `FileMagicInfo` for the given data with default options, reusing the result of a
    previous call with equal input when it is still cached. The input has to be hashable since
    it serves as the cache key.
    """
    return FileMagicInfo(data)

Classes

class FileMagicInfo (data, default='bin', decompress=True)
Expand source code Browse git
class FileMagicInfo:
    """
    Collects file type information about a buffer of binary data.

    After construction, the following attributes are set:

    - `extension`: the guessed file extension without leading dot, or the `default` argument
      when nothing better is known. For gzip and bzip2 data whose beginning could be
      decompressed, it is a compound of the form `inner.gz` or `inner.bz2`.
    - `description`: the textual description returned by `magicparse`, the details reported by
      `get_structured_data_type`, or the string `data` if neither produced a result.
    - `mime`: the MIME type, lowercased and without parameters; `application/octet-stream`
      unless a more specific type was determined.
    - `blob`: `True` if the data remained unidentified, i.e. the description is `data` and
      `get_structured_data_type` did not recognize the buffer either.
    """
    extension: str
    description: str
    mime: str
    blob: bool

    # Stop decompressing once this many bytes of decompressed data are available.
    _GZIP_PEEK_MAXIMUM = 1024
    # The decompressed prefix is only inspected when it is longer than this many bytes.
    _GZIP_PEEK_MINIMUM = 64
    # Number of compressed bytes fed to the decompressor per step.
    _GZIP_DC_CHUNK_LEN = 16

    def __init__(self, data, default: str = 'bin', decompress: bool = True) -> None:
        """
        Inspect `data` and populate the attributes of this object. The `default` value is used
        as the extension when no better one is found. When `decompress` is set, the beginning
        of gzip and bzip2 data is decompressed in order to also determine the extension of the
        compressed content.
        """
        extension = default
        description = 'data'
        mime = 'application/octet-stream'
        blob = True

        if magic is not None:
            if not isinstance(data, bytes):
                data = bytes(data)
            mime = magicparse(data, mime=True)
            # Drop parameters like "; charset=..." and normalize case for the table lookup.
            mime = mime.split(';')[0].lower()
            extension = FileTypeMap.get(mime, default)
            description = magicparse(data).strip()
            if description == 'Microsoft OOXML':
                extension = 'docx'
            elif extension == 'exe' and (t := get_pe_type(data)):
                extension = t.extension
            blob = description.lower() == 'data'

        # Second opinion for data that is still unidentified at this point.
        if blob and (check := get_structured_data_type(data)):
            extension = check.extension
            description = check.details
            mime = check.mime
            blob = False

        if extension in ('gz', 'gzip', 'bz2') and decompress:
            if extension == 'bz2':
                import bz2
                dc = bz2.BZ2Decompressor()
            else:
                import zlib
                dc = zlib.decompressobj(0x10)
            mv = memoryview(data)
            cursor = 0
            buffer = bytearray()
            while cursor < len(mv):
                end = cursor + self._GZIP_DC_CHUNK_LEN
                try:
                    buffer.extend(dc.decompress(mv[cursor:end]))
                except Exception:
                    # Best effort: corrupt or truncated streams still yield what was decoded.
                    break
                else:
                    cursor = end
                # A zlib decompressor does not raise when fed input after the end of the
                # stream, it only collects it as unused data. Without the eof check, the loop
                # would walk through all trailing data without ever producing more output.
                if dc.eof or len(buffer) >= self._GZIP_PEEK_MAXIMUM:
                    break
            if len(buffer) > self._GZIP_PEEK_MINIMUM:
                try:
                    inner = FileMagicInfo(buffer, default).extension
                except Exception:
                    # Best effort: keep the plain extension if the inner data defies analysis.
                    pass
                else:
                    extension = F'{inner}.{extension}'

        self.extension = extension
        self.description = description
        self.mime = mime
        self.blob = blob

Class variables

var extension

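The file extension guessed for the data, without a leading dot. It falls back to the default argument of the constructor, and becomes a compound such as inner.gz when the beginning of gzip or bzip2 data could be decompressed and inspected.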

var description

A textual description of the detected file type. It is the string 'data' when the type could not be determined.

var mime

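The detected MIME type in lowercase and without parameters. It is 'application/octet-stream' when no more specific type was determined.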

var blob

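A boolean that is true when the data could not be identified, i.e. the description is 'data' and no structured data type was recognized.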