PNG  IHDRxsBIT|d pHYs+tEXtSoftwarewww.inkscape.org<,tEXtComment File Manager

File Manager

Path: /opt/cloudlinux/venv/lib/python3.11/site-packages/charset_normalizer/

Viewing File: utils.py

try:
    # WARNING: unicodedata2 support is going to be removed in 3.0
    # Python is quickly catching up.
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the Unicode range official name from a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? Its the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python,
        # Zero Width No-Break Space located in 	Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify is a specific encoding is a multi byte one based on it IANA name
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name


def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:

    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:

    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
b IDATxytVսϓ22 A@IR :hCiZ[v*E:WũZA ^dQeQ @ !jZ'>gsV仿$|?g)&x-EIENT ;@xT.i%-X}SvS5.r/UHz^_$-W"w)Ɗ/@Z &IoX P$K}JzX:;` &, ŋui,e6mX ԵrKb1ԗ)DADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADA݀!I*]R;I2$eZ#ORZSrr6mteffu*((Pu'v{DIߔ4^pIm'77WEEE;vƎ4-$]'RI{\I&G :IHJ DWBB=\WR޽m o$K(V9ABB.}jѢv`^?IOȅ} ڶmG}T#FJ`56$-ھ}FI&v;0(h;Б38CӧOWf!;A i:F_m9s&|q%=#wZprrrla A &P\\СC[A#! {olF} `E2}MK/vV)i{4BffV\|ۭX`b@kɶ@%i$K z5zhmX[IXZ` 'b%$r5M4º/l ԃߖxhʔ)[@=} K6IM}^5k㏷݆z ΗÿO:gdGBmyT/@+Vɶ纽z񕏵l.y޴it뭷zV0[Y^>Wsqs}\/@$(T7f.InݺiR$푔n.~?H))\ZRW'Mo~v Ov6oԃxz! S,&xm/yɞԟ?'uaSѽb,8GלKboi&3t7Y,)JJ c[nzӳdE&KsZLӄ I?@&%ӟ۶mSMMњ0iؐSZ,|J+N ~,0A0!5%Q-YQQa3}$_vVrf9f?S8`zDADADADADADADADADAdqP,تmMmg1V?rSI꒟]u|l RCyEf٢9 jURbztѰ!m5~tGj2DhG*{H9)꒟ר3:(+3\?/;TUݭʴ~S6lڧUJ*i$d(#=Yݺd{,p|3B))q:vN0Y.jkק6;SɶVzHJJЀ-utѹսk>QUU\޲~]fFnK?&ߡ5b=z9)^|u_k-[y%ZNU6 7Mi:]ۦtk[n X(e6Bb."8cۭ|~teuuw|ήI-5"~Uk;ZicEmN/:]M> cQ^uiƞ??Ңpc#TUU3UakNwA`:Y_V-8.KKfRitv޲* 9S6ֿj,ՃNOMߤ]z^fOh|<>@Å5 _/Iu?{SY4hK/2]4%it5q]GGe2%iR| W&f*^]??vq[LgE_3f}Fxu~}qd-ږFxu~I N>\;͗O֊:̗WJ@BhW=y|GgwܷH_NY?)Tdi'?խwhlmQi !SUUsw4kӺe4rfxu-[nHtMFj}H_u~w>)oV}(T'ebʒv3_[+vn@Ȭ\S}ot}w=kHFnxg S 0eޢm~l}uqZfFoZuuEg `zt~? b;t%>WTkķh[2eG8LIWx,^\thrl^Ϊ{=dž<}qV@ ⠨Wy^LF_>0UkDuʫuCs$)Iv:IK;6ֲ4{^6եm+l3>݆uM 9u?>Zc }g~qhKwڭeFMM~pМuqǿz6Tb@8@Y|jx](^]gf}M"tG -w.@vOqh~/HII`S[l.6nØXL9vUcOoB\xoǤ'T&IǍQw_wpv[kmO{w~>#=P1Pɞa-we:iǏlHo׈꒟f9SzH?+shk%Fs:qVhqY`jvO'ρ?PyX3lх]˾uV{ݞ]1,MzYNW~̈́ joYn}ȚF߾׮mS]F z+EDxm/d{F{-W-4wY듏:??_gPf ^3ecg ҵs8R2מz@TANGj)}CNi/R~}c:5{!ZHӋӾ6}T]G]7W6^n 9*,YqOZj:P?Q DFL|?-^.Ɵ7}fFh׶xe2Pscz1&5\cn[=Vn[ĶE鎀uˌd3GII k;lNmشOuuRVfBE]ۣeӶu :X-[(er4~LHi6:Ѻ@ԅrST0trk%$Č0ez" *z"T/X9|8.C5Feg}CQ%͞ˣJvL/?j^h&9xF`њZ(&yF&Iݻfg#W;3^{Wo^4'vV[[K';+mӍִ]AC@W?1^{එyh +^]fm~iԵ]AB@WTk̏t uR?l.OIHiYyԶ]Aˀ7c:q}ힽaf6Z~қm(+sK4{^6}T*UUu]n.:kx{:2 _m=sAߤU@?Z-Vކеz왍Nэ{|5 pڶn b p-@sPg]0G7fy-M{GCF'%{4`=$-Ge\ eU:m+Zt'WjO!OAF@ik&t݆ϥ_ e}=]"Wz_.͜E3leWFih|t-wZۍ-uw=6YN{6|} |*={Ѽn.S.z1zjۻTH]흾 DuDvmvK.`V]yY~sI@t?/ϓ. m&["+P?MzovVЫG3-GRR[(!!\_,^%?v@ҵő m`Y)tem8GMx.))A]Y i`ViW`?^~!S#^+ѽGZj?Vģ0.))A꨷lzL*]OXrY`DBBLOj{-MH'ii-ϰ ok7^ )쭡b]UXSְmռY|5*cֽk0B7镹%ڽP#8nȎq}mJr23_>lE5$iwui+ H~F`IjƵ@q \ @#qG0".0" l`„.0! ,AQHN6qzkKJ#o;`Xv2>,tێJJ7Z/*A .@fفjMzkg @TvZH3Zxu6Ra'%O?/dQ5xYkU]Rֽkق@DaS^RSּ5|BeHNN͘p HvcYcC5:y #`οb;z2.!kr}gUWkyZn=f Pvsn3p~;4p˚=ē~NmI] ¾ 0lH[_L hsh_ғߤc_њec)g7VIZ5yrgk̞W#IjӪv>՞y睝M8[|]\շ8M6%|@PZڨI-m>=k='aiRo-x?>Q.}`Ȏ:Wsmu u > .@,&;+!!˱tﭧDQwRW\vF\~Q7>spYw$%A~;~}6¾ g&if_=j,v+UL1(tWake:@Ș>j$Gq2t7S?vL|]u/ .(0E6Mk6hiۺzښOrifޱxm/Gx> Lal%%~{lBsR4*}{0Z/tNIɚpV^#Lf:u@k#RSu =S^ZyuR/.@n&΃z~B=0eg뺆#,Þ[B/?H uUf7y Wy}Bwegל`Wh(||`l`.;Ws?V@"c:iɍL֯PGv6zctM̠':wuW;d=;EveD}9J@B(0iհ bvP1{\P&G7D޴Iy_$-Qjm~Yrr&]CDv%bh|Yzni_ˆR;kg}nJOIIwyuL}{ЌNj}:+3Y?:WJ/N+Rzd=hb;dj͒suݔ@NKMԄ jqzC5@y°hL m;*5ezᕏ=ep XL n?מ:r`۵tŤZ|1v`V뽧_csج'ߤ%oTuumk%%%h)uy]Nk[n 'b2 l.=͜E%gf$[c;s:V-͞WߤWh-j7]4=F-X]>ZLSi[Y*We;Zan(ӇW|e(HNNP5[= r4tP &0<pc#`vTNV GFqvTi*Tyam$ߏWyE*VJKMTfFw>'$-ؽ.Ho.8c"@DADADADADADADADADA~j*֘,N;Pi3599h=goضLgiJ5փy~}&Zd9p֚ e:|hL``b/d9p? fgg+%%hMgXosج, ΩOl0Zh=xdjLmhݻoO[g_l,8a]٭+ӧ0$I]c]:粹:Teꢢ"5a^Kgh,&= =՟^߶“ߢE ܹS J}I%:8 IDAT~,9/ʃPW'Mo}zNƍ쨓zPbNZ~^z=4mswg;5 Y~SVMRXUյڱRf?s:w ;6H:ºi5-maM&O3;1IKeamZh͛7+##v+c ~u~ca]GnF'ټL~PPPbn voC4R,ӟgg %hq}@#M4IÇ Oy^xMZx ) yOw@HkN˖-Sǎmb]X@n+i͖!++K3gd\$mt$^YfJ\8PRF)77Wא!Cl$i:@@_oG I{$# 8磌ŋ91A (Im7֭>}ߴJq7ޗt^ -[ԩSj*}%]&' -ɓ'ꫯVzzvB#;a 7@GxI{j޼ƌ.LÇWBB7`O"I$/@R @eee@۷>}0,ɒ2$53Xs|cS~rpTYYY} kHc %&k.], @ADADADADADADADADA@lT<%''*Lo^={رc5h %$+CnܸQ3fҥK}vUVVs9G R,_{xˇ3o߾;TTTd}馛]uuuG~iԩ@4bnvmvfϞ /Peeeq}}za I~,誫{UWW뮻}_~YƍSMMMYχ֝waw\ďcxꩧtEƍկ_?۷5@u?1kNׯWzz/wy>}zj3 k(ٺuq_Zvf̘:~ ABQ&r|!%KҥKgԞ={<_X-z !CyFUUz~ ABQIIIjݺW$UXXDٳZ~ ABQƍecW$<(~<RSSvZujjjԧOZQu@4 8m&&&jԩg$ď1h ͟?_{768@g =@`)))5o6m3)ѣƌJ;wҿUTT /KZR{~a=@0o<*狔iFɶ[ˎ;T]]OX@?K.ۈxN pppppppppppppppppPfl߾] ,{ァk۶mڿo5BTӦMӴiӴ|r DB2e|An!Dy'tkΝ[A $***t5' "!駟oaDnΝ:t֭[gDШQ06qD;@ x M6v(PiizmZ4ew"@̴ixf [~-Fٱc&IZ2|n!?$@{[HTɏ#@hȎI# _m(F /6Z3z'\r,r!;w2Z3j=~GY7"I$iI.p_"?pN`y DD?: _  Gÿab7J !Bx@0 Bo cG@`1C[@0G @`0C_u V1 aCX>W ` | `!<S `"<. `#c`?cAC4 ?c p#~@0?:08&_MQ1J h#?/`7;I  q 7a wQ A 1 Hp !#<8/#@1Ul7=S=K.4Z?E_$i@!1!E4?`P_  @Bă10#: "aU,xbFY1 [n|n #'vEH:`xb #vD4Y hi.i&EΖv#O H4IŶ}:Ikh @tZRF#(tXҙzZ ?I3l7q@õ|ۍ1,GpuY Ꮿ@hJv#xxk$ v#9 5 }_$c S#=+"K{F*m7`#%H:NRSp6I?sIՖ{Ap$I$I:QRv2$Z @UJ*$]<FO4IENDB`