198 lines
6.0 KiB
Python
198 lines
6.0 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
import unicodedata
|
|
from collections.abc import Iterable
|
|
from html.entities import name2codepoint
|
|
|
|
try:
|
|
import unidecode
|
|
except ImportError:
|
|
import text_unidecode as unidecode
|
|
|
|
__all__ = ['slugify', 'smart_truncate']
|
|
|
|
|
|
# Named HTML character references such as ``&amp;``; the alternation is built
# from every entity name known to ``html.entities.name2codepoint``.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % ('|'.join(name2codepoint),))

# Numeric character references: decimal (``&#38;``) and hexadecimal (``&#x26;``).
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')

# One or more consecutive single quotes.
QUOTE_PATTERN = re.compile(r'[\']+')

# Runs of characters that may not appear in a slug: anything other than ASCII
# letters, digits and dashes, or (unicode variant) any non-word character or
# underscore.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')

# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')

# A comma sitting directly between two digits (e.g. the one in ``1,000``).
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')

# Separator used internally while a slug is being built.
DEFAULT_SEPARATOR = '-'
|
|
|
|
|
|
def smart_truncate(
    string: str,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = " ",
    save_order: bool = False,
) -> str:
    """
    Truncate a string.

    :param string (str): string for modification
    :param max_length (int): output string length; zero or a negative value disables truncation
    :param word_boundary (bool): if True, only whole words are kept so that no word is cut in half
    :param save_order (bool): if True then word order of output string is like input string
    :param separator (str): separator between words
    :return (str): the truncated string, with leading and trailing separator characters stripped
    """

    string = string.strip(separator)

    # A non-positive limit means "no limit". Without the explicit negative
    # check, ``string[:max_length]`` below would silently chop characters off
    # the end of the string.
    if not max_length or max_length < 0:
        return string

    if len(string) < max_length:
        return string

    if not word_boundary:
        return string[:max_length].strip(separator)

    if separator not in string:
        return string[:max_length]

    truncated = ''
    for word in string.split(separator):
        if not word:
            # consecutive separators produce empty chunks; ignore them
            continue
        next_len = len(truncated) + len(word)
        if next_len < max_length:
            truncated += word + separator
        elif next_len == max_length:
            truncated += word
            break
        elif save_order:
            # the word does not fit; taking a later, shorter word instead
            # would change the word order, so stop here
            break

    if not truncated:  # pragma: no cover
        # no whole word fits: fall back to a hard cut
        truncated = string[:max_length]
    return truncated.strip(separator)
|
|
|
|
|
|
def _decode_char_reference(match: re.Match[str], base: int) -> str:
    """Return the character for a numeric reference, or the reference itself if it is invalid."""
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # code point outside the range chr() accepts: leave the reference as
        # written so the other references in the text are still decoded
        return match.group(0)


def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str or compiled pattern): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str): the slug
    """

    # ensure text is unicode before any str-based processing touches it
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # The rules are applied twice (before and after cleaning), so materialise
    # them once; a generator would otherwise be exhausted by the first pass.
    replacement_rules = [tuple(rule) for rule in replacements or ()]

    # user-specific replacements
    for old, new in replacement_rules:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # normalize text, transliterate to ASCII unless unicode is allowed
    normal_form = 'NFKC' if allow_unicode else 'NFKD'
    text = unicodedata.normalize(normal_form, text)
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        text = DECIMAL_PATTERN.sub(lambda m: _decode_char_reference(m, 10), text)

    # hexadecimal character reference
    if hexadecimal:
        text = HEX_PATTERN.sub(lambda m: _decode_char_reference(m, 16), text)

    # re normalize text: decoding references may have introduced new characters
    text = unicodedata.normalize(normal_form, text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers: drop commas that sit between digits
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant dashes
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords; a set gives constant-time lookups and, unlike a bare
    # generator, survives being tested against every word
    if lowercase:
        excluded_words = {word.lower() for word in stopwords or ()}
    else:
        excluded_words = set(stopwords or ())
    if excluded_words:
        text = DEFAULT_SEPARATOR.join(
            word for word in text.split(DEFAULT_SEPARATOR) if word not in excluded_words
        )

    # finalize user-specific replacements
    for old, new in replacement_rules:
        text = text.replace(old, new)

    # smart truncate if requested; this runs while words are still joined by
    # the default separator, i.e. before a custom separator is substituted
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text
|