Module refinery.lib.suffixtree
This module contains an implementation of Ukkonen's suffix tree algorithm.
Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This module contains an implementation of Ukkonen's suffix tree algorithm.
"""
from __future__ import annotations
from abc import ABCMeta
from io import BytesIO
from typing import Any, ByteString, Iterable, List, Dict, Optional
from collections import deque
class NodeMeta(ABCMeta):
def __new__(mcls, name, bases, namespace: Dict[str, Any]):
namespace.setdefault('__slots__', tuple(namespace.get('__annotations__', ())))
return ABCMeta.__new__(mcls, name, bases, namespace)
class Node(metaclass=NodeMeta):
children: Dict[int, Node]
end: int
link: Optional[Node]
start: int
tree: SuffixTree
def __init__(self, tree: SuffixTree):
self.tree = tree
self.link = None
self.children = {}
@property
def label(self) -> ByteString:
return self.tree.data[self.start:self.end + 1]
@property
def rootpath(self) -> Iterable[Node]:
if self.link is None:
return
yield from self.link.rootpath
yield self
def visualize(self, depth=0, **kwargs):
r = repr(self)
print(r.rjust(len(r) + depth * 2), **kwargs)
for child in self.children.values():
child.visualize(depth + 1, **kwargs)
@property
def depth(self) -> int:
if self.link is None:
return 0
return 1 + self.link.depth
@property
def suffix(self) -> bytes:
with BytesIO() as stream:
for node in self.rootpath:
stream.write(node.label)
return stream.getvalue()
def fixlinks(self):
queue = deque([self])
while queue:
node = queue.pop()
for child in node.children.values():
queue.appendleft(child)
child.link = node
def __repr__(self) -> str:
label = bytes(self.label).decode('utf8', errors='backslashreplace')
if label: label = F': {label}'
return F'<{self.__class__.__name__}{label}>'
def __iter__(self) -> Iterable[Node]:
yield from self.children.values()
class Link(Node):
def __init__(self, tree: SuffixTree, start=None, end=None):
Node.__init__(self, tree)
self.start = start
self.link = tree.root
self.end = end
class Leaf(Node):
def __init__(self, tree: SuffixTree, start=None):
Node.__init__(self, tree)
self.start = start
self.link = tree.root
@property
def end(self): return self.tree.cursor
class Root(Node):
def __init__(self, tree: SuffixTree):
Node.__init__(self, tree)
self.start = self.end = -1
class SuffixTree:
root: Root
data: ByteString
cursor: int
def __init__(self, data: ByteString):
self.data = memoryview(data)
self.root = Root(self)
self.half = None
self.node = self.root
self.end = None
self.suffix_left = 0
self.length_left = 0
self.leaves: List[Leaf] = []
for self.cursor in range(len(self.data)):
self.extend()
del self.suffix_left
del self.length_left
del self.end
del self.node
del self.half
self.root.fixlinks()
def __iter__(self) -> Iterable[Node]:
yield from self.root.children.values()
def traversable(self, node):
length = node.end - node.start + 1
if self.length_left < length:
return False
self.node = node
self.end += length
self.length_left -= length
return True
def sprout(self) -> Leaf:
leaf = Leaf(self, self.cursor)
self.leaves.append(leaf)
return leaf
def extend(self):
self.suffix_left += 1
self.half = None
while self.suffix_left > 0:
if not self.length_left:
self.end = self.cursor
bridge = self.node.children.get(self.data[self.end])
if bridge:
if self.traversable(bridge):
continue
if self.data[bridge.start + self.length_left] == self.data[self.cursor]:
if self.half is not None and self.node != self.root:
self.half.link = self.node
self.half = None
self.length_left += 1
break
split = Link(self, bridge.start, bridge.start + self.length_left - 1)
self.node.children[self.data[self.end]] = split
split.children[self.data[self.cursor]] = self.sprout()
bridge.start += self.length_left
split.children[self.data[bridge.start]] = bridge
if self.half:
self.half.link = split
self.half = split
else:
self.node.children[self.data[self.end]] = self.sprout()
if self.half:
self.half.link = self.node
self.half = None
self.suffix_left -= 1
if self.node is not self.root:
self.node = self.node.link
elif self.length_left:
self.length_left -= 1
self.end = self.cursor - self.suffix_left + 1
Classes
class NodeMeta (*args, **kwargs)
-
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed directly, and then acts as a mix-in class. You can also register unrelated concrete classes (even built-in classes) and unrelated ABCs as 'virtual subclasses' – these and their descendants will be considered subclasses of the registering ABC by the built-in issubclass() function, but the registering ABC won't show up in their MRO (Method Resolution Order) nor will method implementations defined by the registering ABC be callable (not even via super()).
Expand source code Browse git
class NodeMeta(ABCMeta): def __new__(mcls, name, bases, namespace: Dict[str, Any]): namespace.setdefault('__slots__', tuple(namespace.get('__annotations__', ()))) return ABCMeta.__new__(mcls, name, bases, namespace)
Ancestors
- abc.ABCMeta
- builtins.type
class Node (tree)
-
Expand source code Browse git
class Node(metaclass=NodeMeta): children: Dict[int, Node] end: int link: Optional[Node] start: int tree: SuffixTree def __init__(self, tree: SuffixTree): self.tree = tree self.link = None self.children = {} @property def label(self) -> ByteString: return self.tree.data[self.start:self.end + 1] @property def rootpath(self) -> Iterable[Node]: if self.link is None: return yield from self.link.rootpath yield self def visualize(self, depth=0, **kwargs): r = repr(self) print(r.rjust(len(r) + depth * 2), **kwargs) for child in self.children.values(): child.visualize(depth + 1, **kwargs) @property def depth(self) -> int: if self.link is None: return 0 return 1 + self.link.depth @property def suffix(self) -> bytes: with BytesIO() as stream: for node in self.rootpath: stream.write(node.label) return stream.getvalue() def fixlinks(self): queue = deque([self]) while queue: node = queue.pop() for child in node.children.values(): queue.appendleft(child) child.link = node def __repr__(self) -> str: label = bytes(self.label).decode('utf8', errors='backslashreplace') if label: label = F': {label}' return F'<{self.__class__.__name__}{label}>' def __iter__(self) -> Iterable[Node]: yield from self.children.values()
Subclasses
Instance variables
var label
-
Expand source code Browse git
@property def label(self) -> ByteString: return self.tree.data[self.start:self.end + 1]
var rootpath
-
Expand source code Browse git
@property def rootpath(self) -> Iterable[Node]: if self.link is None: return yield from self.link.rootpath yield self
var depth
-
Expand source code Browse git
@property def depth(self) -> int: if self.link is None: return 0 return 1 + self.link.depth
var suffix
-
Expand source code Browse git
@property def suffix(self) -> bytes: with BytesIO() as stream: for node in self.rootpath: stream.write(node.label) return stream.getvalue()
var children
-
Return an attribute of instance, which is of type owner.
var end
-
Return an attribute of instance, which is of type owner.
var link
-
Return an attribute of instance, which is of type owner.
var start
-
Return an attribute of instance, which is of type owner.
var tree
-
Return an attribute of instance, which is of type owner.
Methods
def visualize(self, depth=0, **kwargs)
-
Expand source code Browse git
def visualize(self, depth=0, **kwargs): r = repr(self) print(r.rjust(len(r) + depth * 2), **kwargs) for child in self.children.values(): child.visualize(depth + 1, **kwargs)
def fixlinks(self)
-
Expand source code Browse git
def fixlinks(self): queue = deque([self]) while queue: node = queue.pop() for child in node.children.values(): queue.appendleft(child) child.link = node
class Link (tree, start=None, end=None)
-
Expand source code Browse git
class Link(Node): def __init__(self, tree: SuffixTree, start=None, end=None): Node.__init__(self, tree) self.start = start self.link = tree.root self.end = end
Ancestors
Inherited members
class Leaf (tree, start=None)
-
Expand source code Browse git
class Leaf(Node): def __init__(self, tree: SuffixTree, start=None): Node.__init__(self, tree) self.start = start self.link = tree.root @property def end(self): return self.tree.cursor
Ancestors
Inherited members
class Root (tree)
-
Expand source code Browse git
class Root(Node): def __init__(self, tree: SuffixTree): Node.__init__(self, tree) self.start = self.end = -1
Ancestors
Inherited members
class SuffixTree (data)
-
Expand source code Browse git
class SuffixTree: root: Root data: ByteString cursor: int def __init__(self, data: ByteString): self.data = memoryview(data) self.root = Root(self) self.half = None self.node = self.root self.end = None self.suffix_left = 0 self.length_left = 0 self.leaves: List[Leaf] = [] for self.cursor in range(len(self.data)): self.extend() del self.suffix_left del self.length_left del self.end del self.node del self.half self.root.fixlinks() def __iter__(self) -> Iterable[Node]: yield from self.root.children.values() def traversable(self, node): length = node.end - node.start + 1 if self.length_left < length: return False self.node = node self.end += length self.length_left -= length return True def sprout(self) -> Leaf: leaf = Leaf(self, self.cursor) self.leaves.append(leaf) return leaf def extend(self): self.suffix_left += 1 self.half = None while self.suffix_left > 0: if not self.length_left: self.end = self.cursor bridge = self.node.children.get(self.data[self.end]) if bridge: if self.traversable(bridge): continue if self.data[bridge.start + self.length_left] == self.data[self.cursor]: if self.half is not None and self.node != self.root: self.half.link = self.node self.half = None self.length_left += 1 break split = Link(self, bridge.start, bridge.start + self.length_left - 1) self.node.children[self.data[self.end]] = split split.children[self.data[self.cursor]] = self.sprout() bridge.start += self.length_left split.children[self.data[bridge.start]] = bridge if self.half: self.half.link = split self.half = split else: self.node.children[self.data[self.end]] = self.sprout() if self.half: self.half.link = self.node self.half = None self.suffix_left -= 1 if self.node is not self.root: self.node = self.node.link elif self.length_left: self.length_left -= 1 self.end = self.cursor - self.suffix_left + 1
Class variables
var root
var data
var cursor
Methods
def traversable(self, node)
-
Expand source code Browse git
def traversable(self, node): length = node.end - node.start + 1 if self.length_left < length: return False self.node = node self.end += length self.length_left -= length return True
def sprout(self)
-
Expand source code Browse git
def sprout(self) -> Leaf: leaf = Leaf(self, self.cursor) self.leaves.append(leaf) return leaf
def extend(self)
-
Expand source code Browse git
def extend(self): self.suffix_left += 1 self.half = None while self.suffix_left > 0: if not self.length_left: self.end = self.cursor bridge = self.node.children.get(self.data[self.end]) if bridge: if self.traversable(bridge): continue if self.data[bridge.start + self.length_left] == self.data[self.cursor]: if self.half is not None and self.node != self.root: self.half.link = self.node self.half = None self.length_left += 1 break split = Link(self, bridge.start, bridge.start + self.length_left - 1) self.node.children[self.data[self.end]] = split split.children[self.data[self.cursor]] = self.sprout() bridge.start += self.length_left split.children[self.data[bridge.start]] = bridge if self.half: self.half.link = split self.half = split else: self.node.children[self.data[self.end]] = self.sprout() if self.half: self.half.link = self.node self.half = None self.suffix_left -= 1 if self.node is not self.root: self.node = self.node.link elif self.length_left: self.length_left -= 1 self.end = self.cursor - self.suffix_left + 1