Module refinery.units.misc.urlfix
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.parse import urlparse, urlunparse, parse_qsl, unquote, quote

from refinery.units import Arg, Unit


class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
    characters in the URL path component and normalizes the network location part to lowercase. Note
    that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
    URL will be swallowed and not return any output.
    """
    def __init__(
        self,
        meta: Arg.Switch('-m', help='Extract the query string parameters as metadata.') = False,
        keep: Arg.Counts('-k', help=(
            'If specified once, it keeps the URL params and query string. If specified '
            'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.'
        )) = 0
    ):
        super().__init__(keep=keep, meta=meta)

    def process(self, data):
        def fix(string):
            # Re-quote a component after unquoting it to obtain consistent escaping.
            return quote(unquote(string))

        keep = self.args.keep
        meta = self.args.meta
        parsed = urlparse(data.decode(self.codec))
        if not parsed.scheme or not parsed.netloc:
            # Chunks that do not parse as a URL are swallowed.
            return None
        query_dict = {key: unquote(value) for key, value in parse_qsl(parsed.query)}
        query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=query_string,
            fragment=fix(parsed.fragment),
        )
        if keep < 2:
            replacements.update(fragment='')
            if keep < 1:
                replacements.update(params='', query='')
        url = urlunparse(parsed._replace(**replacements))
        url = url.encode(self.codec)
        if meta:
            # Attach each query parameter as a metadata variable on the output chunk.
            url = self.labelled(url, **query_dict)
        return url
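For illustration, the following standalone sketch reproduces what process does in its default mode (no -k, no -m): it drops params, query string, and fragment, lowercases the network location, and re-quotes the path. It relies only on urllib.parse; the name normalize_url is illustrative and not part of refinery.

from urllib.parse import quote, unquote, urlparse, urlunparse

def normalize_url(url: str) -> str:
    # Mimic urlfix with default settings: strip params, query, and fragment,
    # lowercase the netloc, and re-quote the path component.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError('input does not look like a URL')
    return urlunparse(parsed._replace(
        netloc=parsed.netloc.lower(),
        path=quote(unquote(parsed.path)),
        params='',
        query='',
        fragment='',
    ))

print(normalize_url('https://Example.COM/Some Path/file.php?id=4&x=y#top'))
# https://example.com/Some%20Path/file.php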
Classes
class urlfix (meta=False, keep=0)
Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output.
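To make the -k and -m options concrete, the sketch below reproduces how process derives the query metadata and the re-encoded query string from parse_qsl; the variable names are illustrative only.

from urllib.parse import parse_qsl, quote, unquote, urlparse

parsed = urlparse('https://example.com/get.php?id=4&path=%2Ftmp%2Fx#frag')

# With -m, each query parameter becomes a metadata variable on the output chunk.
query_dict = {key: unquote(value) for key, value in parse_qsl(parsed.query)}
print(query_dict)    # {'id': '4', 'path': '/tmp/x'}

# With a single -k, the query string is kept but rebuilt with quote(), which
# leaves '/' unescaped by default.
query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
print(query_string)  # id=4&path=/tmp/x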
Ancestors
refinery.units.Unit
Class variables
var required_dependencies
var optional_dependencies
Inherited members