Module refinery.units.formats.json

Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
from typing import Union, Optional

import json

from refinery.units.formats import PathExtractorUnit, UnpackResult, Unit
from refinery.lib.meta import is_valid_variable_name
from refinery.lib.patterns import checks

class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    # NOTE(review): presumably consumed by PathExtractorUnit when matching user-supplied paths; the
    # paths generated in unpack() below are joined with '/' - confirm against the base class.
    _custom_path_separator = '.'

    def unpack(self, data):
        """
        Parse `data` as JSON and yield one `UnpackResult` for every nested element.

        Elements inside a container are yielded before the container itself. The document root is
        never yielded because its path is the empty string. Each result carries the class name of
        the decoded Python value (for example `dict`, `list`, `str`, `int`) as its `type`.
        """

        def crawl(path, cursor):
            # Depth-first walk: descend into containers first, then emit the current node.
            if isinstance(cursor, dict):
                for key, value in cursor.items():
                    yield from crawl(F'{path}/{key}', value)
            elif isinstance(cursor, list):
                for key, value in enumerate(cursor):
                    yield from crawl(F'{path}/{key:d}', value)
            if path:
                yield path, cursor, cursor.__class__.__name__

        for path, item, typename in crawl('', json.loads(data)):
            # Binding item as a default argument freezes the current loop value for the callback,
            # which is handed to UnpackResult uncalled.
            def extract(item=item):
                if isinstance(item, (list, dict)):
                    # Containers are re-serialized as indented JSON.
                    dumped = json.dumps(item, indent=4)
                else:
                    # Scalars are emitted using their Python string representation.
                    dumped = str(item)
                return dumped.encode(self.codec)
            yield UnpackResult(path, extract, type=typename)

    def handles(self, data: bytearray) -> Optional[bool]:
        """
        Return whether `checks.json.fullmatch` accepts the entire input.
        """
        return bool(checks.json.fullmatch(data))

class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to
    extract remaining fields as metadata: String values are extracted only if they do not exceed 80
    characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and
    lists of the latter are also extracted.
    """
    def __init__(
        self,
        key: Unit.Arg.Binary(help='Optional key of a value to become the main body of the chunk.') = None,
        raw: Unit.Arg.Switch('-r', group='META', help='Do not extract any other fields as metadata.') = False,
        all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata.') = False
    ):
        super().__init__(key=key, raw=raw, all=all)

    def process(self, data):
        """
        Parse `data` as a JSON dictionary and return the value stored under the configured key.

        Unless the `raw` switch is set, the remaining top-level fields that pass the `acceptable`
        filter are attached to the result through `self.labelled`.

        Raises:
            ValueError: if the decoded JSON document is not a dictionary.
        """

        def acceptable(key, value, inside_list=False):
            # Decide whether a top-level field qualifies as a metadata variable.
            if not is_valid_variable_name(key):
                return False
            if isinstance(value, dict):
                return False
            if isinstance(value, (float, int, bool)):
                return True
            if inside_list:
                # List members may only be numbers or booleans; nested lists and strings are rejected.
                return False
            if isinstance(value, list):
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if self.args.all:
                    return True
                # Accepts single-line strings of length 1 through 79.
                return len(value) in range(1, 80) and '\n' not in value
            # Anything else (such as a JSON null) is not extracted.
            return False

        doc: dict = json.loads(data)
        if not isinstance(doc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        key = self.args.key
        # A falsy key (the default is None) short-circuits and is passed through unchanged. Otherwise
        # the field is removed from the document so that it is not repeated as metadata; a missing
        # field yields an empty body.
        result = key and doc.pop(key.decode(self.codec), '').encode(self.codec)
        if self.args.raw:
            return result
        else:
            return self.labelled(result, **{
                key: value for key, value in doc.items() if acceptable(key, value)
            })

class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs.
    """

    def process(self, data):
        """
        Yield every element of the JSON document in `data` as indented, encoded JSON.

        For a dictionary the values are emitted; for any other document the result of iterating it
        is emitted. If the input cannot be parsed directly, it is first piped through `carve_json`.
        """
        try:
            doc: Union[list, dict] = json.loads(data)
        except Exception:
            # Fallback for input that is not a clean JSON document. The import is local so that it
            # only happens when this path is taken.
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        try:
            it = doc.values()
        except AttributeError:
            # No values() method: not a dictionary, so iterate the document itself.
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)


class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract values from a JSON document.

Expand source code Browse git
class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    # NOTE(review): presumably consumed by PathExtractorUnit when matching user-supplied paths; the
    # paths generated in unpack() below are joined with '/' - confirm against the base class.
    _custom_path_separator = '.'

    def unpack(self, data):
        """
        Parse `data` as JSON and yield one `UnpackResult` for every nested element.

        Elements inside a container are yielded before the container itself. The document root is
        never yielded because its path is the empty string. Each result carries the class name of
        the decoded Python value (for example `dict`, `list`, `str`, `int`) as its `type`.
        """

        def crawl(path, cursor):
            # Depth-first walk: descend into containers first, then emit the current node.
            if isinstance(cursor, dict):
                for key, value in cursor.items():
                    yield from crawl(F'{path}/{key}', value)
            elif isinstance(cursor, list):
                for key, value in enumerate(cursor):
                    yield from crawl(F'{path}/{key:d}', value)
            if path:
                yield path, cursor, cursor.__class__.__name__

        for path, item, typename in crawl('', json.loads(data)):
            # Binding item as a default argument freezes the current loop value for the callback,
            # which is handed to UnpackResult uncalled.
            def extract(item=item):
                if isinstance(item, (list, dict)):
                    # Containers are re-serialized as indented JSON.
                    dumped = json.dumps(item, indent=4)
                else:
                    # Scalars are emitted using their Python string representation.
                    dumped = str(item)
                return dumped.encode(self.codec)
            yield UnpackResult(path, extract, type=typename)

    def handles(self, data: bytearray) -> Optional[bool]:
        """
        Return whether `checks.json.fullmatch` accepts the entire input.
        """
        return bool(checks.json.fullmatch(data))


Class variables

var required_dependencies
var optional_dependencies


def unpack(self, data)
Expand source code Browse git
def unpack(self, data):
    """
    Parse `data` as JSON and yield one `UnpackResult` for every nested element.

    Elements inside a container are yielded before the container itself. The document root is
    never yielded because its path is the empty string. Each result carries the class name of
    the decoded Python value (for example `dict`, `list`, `str`, `int`) as its `type`.
    """

    def crawl(path, cursor):
        # Depth-first walk: descend into containers first, then emit the current node.
        if isinstance(cursor, dict):
            for key, value in cursor.items():
                yield from crawl(F'{path}/{key}', value)
        elif isinstance(cursor, list):
            for key, value in enumerate(cursor):
                yield from crawl(F'{path}/{key:d}', value)
        if path:
            yield path, cursor, cursor.__class__.__name__

    for path, item, typename in crawl('', json.loads(data)):
        # Binding item as a default argument freezes the current loop value for the callback,
        # which is handed to UnpackResult uncalled.
        def extract(item=item):
            if isinstance(item, (list, dict)):
                # Containers are re-serialized as indented JSON.
                dumped = json.dumps(item, indent=4)
            else:
                # Scalars are emitted using their Python string representation.
                dumped = str(item)
            return dumped.encode(self.codec)
        yield UnpackResult(path, extract, type=typename)

Inherited members

class xj0 (key=None, raw=False, all=False)

Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to extract remaining fields as metadata: String values are extracted only if they do not exceed 80 characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and lists of the latter are also extracted.

Expand source code Browse git
class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to
    extract remaining fields as metadata: String values are extracted only if they do not exceed 80
    characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and
    lists of the latter are also extracted.
    """
    def __init__(
        self,
        key: Unit.Arg.Binary(help='Optional key of a value to become the main body of the chunk.') = None,
        raw: Unit.Arg.Switch('-r', group='META', help='Do not extract any other fields as metadata.') = False,
        all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata.') = False
    ):
        super().__init__(key=key, raw=raw, all=all)

    def process(self, data):
        """
        Parse `data` as a JSON dictionary and return the value stored under the configured key.

        Unless the `raw` switch is set, the remaining top-level fields that pass the `acceptable`
        filter are attached to the result through `self.labelled`.

        Raises:
            ValueError: if the decoded JSON document is not a dictionary.
        """

        def acceptable(key, value, inside_list=False):
            # Decide whether a top-level field qualifies as a metadata variable.
            if not is_valid_variable_name(key):
                return False
            if isinstance(value, dict):
                return False
            if isinstance(value, (float, int, bool)):
                return True
            if inside_list:
                # List members may only be numbers or booleans; nested lists and strings are rejected.
                return False
            if isinstance(value, list):
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if self.args.all:
                    return True
                # Accepts single-line strings of length 1 through 79.
                return len(value) in range(1, 80) and '\n' not in value
            # Anything else (such as a JSON null) is not extracted.
            return False

        doc: dict = json.loads(data)
        if not isinstance(doc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        key = self.args.key
        # A falsy key (the default is None) short-circuits and is passed through unchanged. Otherwise
        # the field is removed from the document so that it is not repeated as metadata; a missing
        # field yields an empty body.
        result = key and doc.pop(key.decode(self.codec), '').encode(self.codec)
        if self.args.raw:
            return result
        else:
            return self.labelled(result, **{
                key: value for key, value in doc.items() if acceptable(key, value)
            })


Class variables

var required_dependencies
var optional_dependencies

Inherited members

class xjl

Returns all JSON elements from a JSON iterable as individual outputs.

Expand source code Browse git
class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs.
    """

    def process(self, data):
        """
        Yield every element of the JSON document in `data` as indented, encoded JSON.

        For a dictionary the values are emitted; for any other document the result of iterating it
        is emitted. If the input cannot be parsed directly, it is first piped through `carve_json`.
        """
        try:
            doc: Union[list, dict] = json.loads(data)
        except Exception:
            # Fallback for input that is not a clean JSON document. The import is local so that it
            # only happens when this path is taken.
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        try:
            it = doc.values()
        except AttributeError:
            # No values() method: not a dictionary, so iterate the document itself.
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)


Class variables

var required_dependencies
var optional_dependencies

Inherited members