from __future__ import annotations import re import unicodedata from collections.abc import Iterable from html.entities import name2codepoint try: import unidecode except ImportError: import text_unidecode as unidecode __all__ = ['slugify', 'smart_truncate'] CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint)) DECIMAL_PATTERN = re.compile(r'&#(\d+);') HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') QUOTE_PATTERN = re.compile(r'[\']+') DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+') DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+') DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') DEFAULT_SEPARATOR = '-' def smart_truncate( string: str, max_length: int = 0, word_boundary: bool = False, separator: str = " ", save_order: bool = False, ) -> str: """ Truncate a string. :param string (str): string for modification :param max_length (int): output string length :param word_boundary (bool): :param save_order (bool): if True then word order of output string is like input string :param separator (str): separator between words :return: """ string = string.strip(separator) if not max_length: return string if len(string) < max_length: return string if not word_boundary: return string[:max_length].strip(separator) if separator not in string: return string[:max_length] truncated = '' for word in string.split(separator): if word: next_len = len(truncated) + len(word) if next_len < max_length: truncated += '{}{}'.format(word, separator) elif next_len == max_length: truncated += '{}'.format(word) break else: if save_order: break if not truncated: # pragma: no cover truncated = string[:max_length] return truncated.strip(separator) def slugify( text: str, entities: bool = True, decimal: bool = True, hexadecimal: bool = True, max_length: int = 0, word_boundary: bool = False, separator: str = DEFAULT_SEPARATOR, save_order: bool = False, stopwords: Iterable[str] = (), regex_pattern: re.Pattern[str] | str | None = None, lowercase: bool = True, replacements: Iterable[Iterable[str]] = (), allow_unicode: bool = False, ) -> str: """ Make a slug from the given text. :param text (str): initial text :param entities (bool): converts html entities to unicode :param decimal (bool): converts html decimal to unicode :param hexadecimal (bool): converts html hexadecimal to unicode :param max_length (int): output string length :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order :param separator (str): separator between words :param stopwords (iterable): words to discount :param regex_pattern (str): regex pattern for disallowed characters :param lowercase (bool): activate case sensitivity by setting it to False :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']] :param allow_unicode (bool): allow unicode characters :return (str): """ # user-specific replacements if replacements: for old, new in replacements: text = text.replace(old, new) # ensure text is unicode if not isinstance(text, str): text = str(text, 'utf-8', 'ignore') # replace quotes with dashes - pre-process text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) # normalize text, convert to unicode if required if allow_unicode: text = unicodedata.normalize('NFKC', text) else: text = unicodedata.normalize('NFKD', text) text = unidecode.unidecode(text) # ensure text is still in unicode if not isinstance(text, str): text = str(text, 'utf-8', 'ignore') # character entity reference if entities: text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text) # decimal character reference if decimal: try: text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text) except Exception: pass # hexadecimal character reference if hexadecimal: try: text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text) except Exception: pass # re normalize text if allow_unicode: text = unicodedata.normalize('NFKC', text) else: text = unicodedata.normalize('NFKD', text) # make the text lowercase (optional) if lowercase: text = text.lower() # remove generated quotes -- post-process text = QUOTE_PATTERN.sub('', text) # cleanup numbers text = NUMBERS_PATTERN.sub('', text) # replace all other unwanted characters if allow_unicode: pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN else: pattern = regex_pattern or DISALLOWED_CHARS_PATTERN text = re.sub(pattern, DEFAULT_SEPARATOR, text) # remove redundant text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) # remove stopwords if stopwords: if lowercase: stopwords_lower = [s.lower() for s in stopwords] words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower] else: words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords] text = DEFAULT_SEPARATOR.join(words) # finalize user-specific replacements if replacements: for old, new in replacements: text = text.replace(old, new) # smart truncate if requested if max_length > 0: text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order) if separator != DEFAULT_SEPARATOR: text = text.replace(DEFAULT_SEPARATOR, separator) return text