mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2025-06-08 07:07:14 +00:00
271 lines
7.4 KiB
Python
271 lines
7.4 KiB
Python
# basic text cleaners for the ACE step model
|
|
# I didn't copy the ones from the reference code because I didn't want to deal with the dependencies
|
|
# TODO: support more languages than English?
|
|
|
|
import re
|
|
|
|
# Word tables shared by the number helpers (built once instead of per call).
_ONES = (
    "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen",
)
_TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
_DIGITS = ("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")

# Cardinal words whose ordinal form is not simply "<word>th".
_IRREGULAR_ORDINALS = {
    "one": "first",
    "two": "second",
    "three": "third",
    "five": "fifth",
    "eight": "eighth",
    "nine": "ninth",
    "twelve": "twelfth",
}


def number_to_text(num, ordinal=False):
    """
    Convert a number (int or float) to its English text representation.

    Args:
        num: The number to convert. For floats the integer part is spelled
            as a whole number and the fractional part digit by digit
            (``1.25`` -> ``"one point two five"``).
        ordinal: If True and ``num`` is an int, return the ordinal form
            (``21`` -> ``"twenty first"``). Has no effect on floats.

    Returns:
        str: Text representation of the number, or the literal string
        ``"Input must be a number"`` when ``num`` is neither int nor float.

    Raises:
        ValueError: If ``num`` is NaN (raised by ``int()``).
        OverflowError: If ``num`` is infinite (raised by ``int()``).
    """
    # Local import keeps this block self-contained.
    from decimal import Decimal

    if not isinstance(num, (int, float)):
        return "Input must be a number"

    is_float = isinstance(num, float)

    # Handle special case of zero
    if num == 0:
        return "zeroth" if (ordinal and not is_float) else "zero"

    # Handle negative numbers
    negative = num < 0
    num = abs(num)

    if is_float:
        int_part = int(num)
        # _int_to_text(0) is the empty string, so spell a leading zero
        # explicitly; otherwise 0.5 would come out as " point five".
        int_text = _int_to_text(int_part) if int_part else "zero"

        # str(float) switches to exponent notation for very small or very
        # large values ("1e-05"), which has no plain fractional digits to
        # read. Decimal formatted with "f" is always fixed-point.
        fixed = format(Decimal(repr(float(num))), "f")
        decimal_str = fixed.partition(".")[2] or "0"
        decimal_text = " point " + " ".join(_digit_to_text(int(digit)) for digit in decimal_str)

        result = int_text + decimal_text
    else:
        result = _int_to_text(num)
        if ordinal:
            result = _cardinal_to_ordinal(result)

    # Add 'negative' prefix for negative numbers
    if negative:
        result = "negative " + result

    return result


def _cardinal_to_ordinal(text):
    """Turn the last word of a cardinal phrase into its ordinal form."""
    head, sep, last = text.rpartition(" ")
    if last in _IRREGULAR_ORDINALS:
        last = _IRREGULAR_ORDINALS[last]
    elif last.endswith("y"):
        # twenty -> twentieth, forty -> fortieth, ...
        last = last[:-1] + "ieth"
    else:
        last += "th"
    return head + sep + last


def _int_to_text(num):
    """Helper function to convert a non-negative integer to text.

    Returns the empty string for 0; callers handle zero themselves.
    Values of a billion and above are expressed as "<n> billion ...".
    """
    if num < 20:
        return _ONES[num]

    if num < 100:
        return _TENS[num // 10] + (" " + _ONES[num % 10] if num % 10 != 0 else "")

    if num < 1000:
        return _ONES[num // 100] + " hundred" + (" " + _int_to_text(num % 100) if num % 100 != 0 else "")

    # Pick the largest scale word that fits, then recurse on both halves.
    if num < 1000000:
        scale, word = 1000, " thousand"
    elif num < 1000000000:
        scale, word = 1000000, " million"
    else:
        scale, word = 1000000000, " billion"

    rest = num % scale
    return _int_to_text(num // scale) + word + (" " + _int_to_text(rest) if rest != 0 else "")


def _digit_to_text(digit):
    """Convert a single digit (0-9) to text"""
    return _DIGITS[digit]
|
|
|
|
|
|
# One or more consecutive whitespace characters.
_whitespace_re = re.compile(r"\s+")


# Per-language list of (compiled pattern, spoken replacement) pairs.
# Each pattern matches the abbreviation at a word boundary followed by a
# literal period, case-insensitively.
# NOTE(review): the "misess" spelling is kept verbatim; presumably it mirrors
# the reference cleaners -- confirm before changing.
_abbreviations = {
    "en": [
        (re.compile(r"\b{}\.".format(short), re.IGNORECASE), spoken)
        for short, spoken in (
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        )
    ],
}
|
|
|
|
|
|
def expand_abbreviations_multilingual(text, lang="en"):
    """Replace known abbreviations in ``text`` with their spoken form.

    The (pattern, replacement) pairs registered for ``lang`` in
    ``_abbreviations`` are applied one after another, in table order.

    Raises:
        KeyError: If ``_abbreviations`` has no entry for ``lang``.
    """
    for pattern, spoken in _abbreviations[lang]:
        text = pattern.sub(spoken, text)
    return text
|
|
|
|
|
|
# Per-language list of (compiled pattern, spoken replacement) pairs for
# symbols. Replacements are padded with spaces so they never fuse with the
# neighbouring characters; the padding is cleaned up after substitution.
_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        ]
    ],
}


def expand_symbols_multilingual(text, lang="en"):
    """Replace symbols such as ``&`` or ``%`` with their spoken form.

    Args:
        text: Input string.
        lang: Key into ``_symbols_multilingual`` selecting the table to use.

    Returns:
        str: ``text`` with every listed symbol replaced, runs of spaces
        collapsed to a single space, and leading/trailing whitespace removed.

    Raises:
        KeyError: If ``_symbols_multilingual`` has no entry for ``lang``.
    """
    for regex, replacement in _symbols_multilingual[lang]:
        text = regex.sub(replacement, text)
    # The padded replacements introduce doubled spaces ("a & b" becomes
    # "a  and  b"); collapse every run so no double spaces remain.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()
|
|
|
|
|
|
# Ordinal tokens such as "1st" or "22nd", per language; group 1 is the number.
_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
}
# Any run of ASCII digits.
_number_re = re.compile(r"[0-9]+")
# An amount with the currency symbol either in front of or behind it.
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

# "1,234,567.89": comma as thousands separator, dot as decimal mark.
_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
# "1.234.567,89": dot as thousands separator, comma as decimal mark.
# The dot must be escaped; a bare "." matches any character, so inputs like
# "5,123.456" were matched as one token and mangled by the dot removal.
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
# A number with a fractional part, using either "." or "," as decimal mark.
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
|
|
|
|
|
|
def _remove_commas(m):
    """``re.sub`` callback: return the whole match with all commas removed."""
    # str.replace is already a no-op when there is no comma in the match.
    return m.group(0).replace(",", "")
|
|
|
|
|
|
def _remove_dots(m):
    """``re.sub`` callback: return the whole match with all dots removed."""
    # str.replace is already a no-op when there is no dot in the match.
    return m.group(0).replace(".", "")
|
|
|
|
|
|
def _expand_decimal_point(m, lang="en"):
    """``re.sub`` callback: spell out a decimal number captured in group 1.

    A comma decimal mark is normalised to a dot before the text is parsed as
    a float and handed to ``number_to_text``. ``lang`` is accepted but not
    used here.
    """
    normalised = m.group(1).replace(",", ".")
    value = float(normalised)
    return number_to_text(value)
|
|
|
|
|
|
# Separator that joins the main amount and the fractional amount in a spoken
# currency phrase, per language. Built once rather than on every match.
_CURRENCY_AND_EQUIVALENTS = {
    "en": ", ",
    "es": " con ",
    "fr": " et ",
    "de": " und ",
    "pt": " e ",
    "it": " e ",
    "pl": ", ",
    "cs": ", ",
    "ru": ", ",
    "nl": ", ",
    "ar": ", ",
    "tr": ", ",
    "hu": ", ",
    "ko": ", ",
}


def _expand_currency(m, lang="en", currency="USD"):
    """``re.sub`` callback: spell out the amount of a currency match.

    Args:
        m: Match whose full text is an amount with a currency symbol.
        lang: Selects the separator used to trim a trailing fractional
            clause from whole amounts; unknown languages skip the trimming.
        currency: Accepted for interface compatibility; not used here.

    Returns:
        str: The spoken amount only; no currency name is added.

    Raises:
        ValueError: If the digits in the match do not form a valid float
            (for example more than one decimal point).
    """
    # Treat "," as the decimal mark, then drop everything that is not a digit
    # or a dot (this removes the currency symbol).
    digits = re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))
    amount = float(digits)

    if not amount.is_integer():
        return number_to_text(amount)

    # Pass whole amounts as int: number_to_text reads every float with a
    # fractional part, so float 5.0 would be spoken as "five point zero".
    full_amount = number_to_text(int(amount))

    separator = _CURRENCY_AND_EQUIVALENTS.get(lang)
    if separator:
        last_and = full_amount.rfind(separator)
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount
|
|
|
|
|
|
def _expand_ordinal(m, lang="en"):
    """``re.sub`` callback: spell out the numeric part (group 1) of an ordinal.

    The number is passed to ``number_to_text`` with ``ordinal=True``.
    ``lang`` is accepted but not used here.
    """
    value = int(m.group(1))
    return number_to_text(value, ordinal=True)
|
|
|
|
|
|
def _expand_number(m, lang="en"):
    """``re.sub`` callback: spell out the integer formed by the whole match.

    ``lang`` is accepted but not used here.
    """
    value = int(m.group(0))
    return number_to_text(value)
|
|
|
|
|
|
def expand_numbers_multilingual(text, lang="en"):
    """Spell out the numbers, currency amounts and ordinals found in ``text``.

    Steps, in order:
      1. Strip thousands separators (commas for "en"/"ru", dots otherwise).
      2. Expand GBP, USD and EUR amounts (best effort).
      3. Expand decimal numbers.
      4. Expand ordinals using the pattern registered for ``lang``.
      5. Expand any remaining digit runs.

    Raises:
        KeyError: If ``_ordinal_re`` has no entry for ``lang``. This happens
            after steps 1-3 have run; the partially expanded text is not
            returned.
    """
    if lang in ["en", "ru"]:
        text = _comma_number_re.sub(_remove_commas, text)
    else:
        text = _dot_number_re.sub(_remove_dots, text)

    # Currency expansion is best effort: a malformed amount must not stop the
    # remaining steps. Catch Exception rather than using a bare except so
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        for code in ("GBP", "USD", "EUR"):
            text = _currency_re[code].sub(
                lambda m, code=code: _expand_currency(m, lang, code), text
            )
    except Exception:
        pass

    text = _decimal_number_re.sub(lambda m: _expand_decimal_point(m, lang), text)
    text = _ordinal_re[lang].sub(lambda m: _expand_ordinal(m, lang), text)
    text = _number_re.sub(lambda m: _expand_number(m, lang), text)
    return text
|
|
|
|
|
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
|
|
|
|
|
def collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)
|
|
|
|
|
|
def multilingual_cleaners(text, lang):
    """Normalise ``text`` for language ``lang``.

    Removes double quotes, lower-cases the text, then applies number,
    abbreviation and symbol expansion, and finally collapses whitespace.
    Each expansion step is best effort: if it raises, its result is
    discarded and the text continues unchanged to the next step.

    Args:
        text: Input string.
        lang: Language code passed to the expansion helpers.

    Returns:
        str: The cleaned text.
    """
    text = text.replace('"', "")
    if lang == "tr":
        # Map these capitals by hand before the generic lower-casing.
        # NOTE(review): presumably because str.lower() turns "İ" into "i"
        # plus a combining dot -- confirm.
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")
    text = lowercase(text)

    # Catch Exception rather than using a bare except so KeyboardInterrupt /
    # SystemExit still propagate.
    expansion_steps = (
        expand_numbers_multilingual,
        expand_abbreviations_multilingual,
        expand_symbols_multilingual,
    )
    for step in expansion_steps:
        try:
            text = step(text, lang)
        except Exception:
            pass

    text = collapse_whitespace(text)
    return text
|
|
|
|
|
|
def basic_cleaners(text):
    """Lower-case ``text`` and collapse its whitespace; no transliteration."""
    return collapse_whitespace(lowercase(text))
|