Module refinery.units.strings.ngrams

Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from refinery.units import Arg, Unit
from refinery.lib.tools import integers_of_slice


class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range given
    as a Python slice expression.
    """
    # The only argument is the slice of n-gram sizes; it is forwarded unchanged to the base class
    # and read back in process() as self.args.size.
    def __init__(
        self, size: Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.') = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        """
        Yield each distinct n-gram of `data` for every size n produced by the configured slice.

        Sizes are handled in the order `integers_of_slice` produces them, and iteration stops at
        the first size that exceeds the input length. Within one size, only the first occurrence
        of an n-gram is emitted; it is labelled with the offset of that occurrence.
        """
        for n in integers_of_slice(self.args.size):
            self.log_info(F'emitting {n}-grams')
            length = len(data)
            if length < n:
                # The first size that does not fit into the input ends the whole run.
                break
            # Uniqueness is tracked per size, so the set starts out empty for every n.
            seen = set()
            window = memoryview(data)
            # Pair every window start with its end position; the ends run from n up to the
            # length of the input, so each pair delimits exactly one window of n bytes.
            for start, stop in enumerate(range(n, length + 1)):
                chunk = bytes(window[start:stop])
                if chunk not in seen:
                    seen.add(chunk)
                    yield self.labelled(chunk, offset=start)

Classes

class ngrams (size=slice(2, None, None))

Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams and deduplicates using a set data structure. The number n is taken from an arbitrary range given as a Python slice expression.

Expand source code Browse git
class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range given
    as a Python slice expression.
    """
    # The only argument is the slice of n-gram sizes; it is forwarded unchanged to the base class
    # and read back in process() as self.args.size.
    def __init__(
        self, size: Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.') = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        """
        Yield each distinct n-gram of `data` for every size n produced by the configured slice.

        Sizes are handled in the order `integers_of_slice` produces them, and iteration stops at
        the first size that exceeds the input length. Within one size, only the first occurrence
        of an n-gram is emitted; it is labelled with the offset of that occurrence.
        """
        for n in integers_of_slice(self.args.size):
            self.log_info(F'emitting {n}-grams')
            length = len(data)
            if length < n:
                # The first size that does not fit into the input ends the whole run.
                break
            # Uniqueness is tracked per size, so the set starts out empty for every n.
            seen = set()
            window = memoryview(data)
            # Pair every window start with its end position; the ends run from n up to the
            # length of the input, so each pair delimits exactly one window of n bytes.
            for start, stop in enumerate(range(n, length + 1)):
                chunk = bytes(window[start:stop])
                if chunk not in seen:
                    seen.add(chunk)
                    yield self.labelled(chunk, offset=start)

Ancestors

Class variables

var required_dependencies
var optional_dependencies

Inherited members