Module refinery.units.misc.urlfix
Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from typing import Optional
from urllib.parse import urlparse, urlunparse, parse_qsl, unquote, quote
from refinery.units import Arg, Unit
from refinery.lib.decorators import unicoded
class urlfix(Unit):
"""
Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
characters in the URL path component and normalizes the network location part to lowercase. Note
that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
URL will be swallowed and not return any output.
"""
def __init__(
self,
keep: Arg('-k', action='count', help=(
'If specified once, keeps the it keeps the URL params and query string. If specified '
'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
'anything that does not parse as a URL.'
)) = 0
):
super().__init__(keep=keep)
@unicoded
def process(self, data: str) -> Optional[str]:
def fix(string):
return quote(unquote(string))
keep = self.args.keep
parsed = urlparse(data)
if not parsed.scheme or not parsed.netloc:
return None
new_query = '&'.join(F'{key}={fix(value)}' for key, value in parse_qsl(parsed.query))
replacements = dict(
netloc=parsed.netloc.lower(),
params=fix(parsed.params),
path=fix(parsed.path),
query=new_query,
fragment=fix(parsed.fragment),
)
if keep < 2:
replacements.update(fragment='')
if keep < 1:
replacements.update(params='', query='')
return urlunparse(parsed._replace(**replacements))
Classes
class urlfix (keep=0)
-
Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output.
Expand source code Browse git
class urlfix(Unit): """ Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output. """ def __init__( self, keep: Arg('-k', action='count', help=( 'If specified once, keeps the it keeps the URL params and query string. If specified ' 'twice, it keeps the URL fragment as well. At this level, the unit still filters out ' 'anything that does not parse as a URL.' )) = 0 ): super().__init__(keep=keep) @unicoded def process(self, data: str) -> Optional[str]: def fix(string): return quote(unquote(string)) keep = self.args.keep parsed = urlparse(data) if not parsed.scheme or not parsed.netloc: return None new_query = '&'.join(F'{key}={fix(value)}' for key, value in parse_qsl(parsed.query)) replacements = dict( netloc=parsed.netloc.lower(), params=fix(parsed.params), path=fix(parsed.path), query=new_query, fragment=fix(parsed.fragment), ) if keep < 2: replacements.update(fragment='') if keep < 1: replacements.update(params='', query='') return urlunparse(parsed._replace(**replacements))
Ancestors
Class variables
var required_dependencies
var optional_dependencies
Inherited members