httpx icon indicating copy to clipboard operation
httpx copied to clipboard

Limit which text codecs are supported.

Open lovelydinosaur opened this issue 2 years ago • 1 comment
trafficstars

Closes #2892.

Constrains which text codecs are supported when accessing response.text. (Uses the set currently supported by the Chromium browser.)

Initially prompted by behaviour in https://github.com/encode/httpx/discussions/2881 where a non-text codec "base64" was being loaded and causing an exception. The behaviour after this PR would be to fall back to utf-8 in that case, with safe character replacement. The result would be the base64-encoded content being returned as a string.

lovelydinosaur avatar Oct 19 '23 09:10 lovelydinosaur

The set of encoding names defined in the WHATWG encoding living standard...

https://encoding.spec.whatwg.org/#names-and-labels

User agents have also significantly deviated from the labels listed in the IANA Character Sets registry. To stop spreading legacy encodings further, this specification is exhaustive about the aforementioned details and therefore has no need for the registry.

# UTF-8 "unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8", "x-unicode20utf8", # Legacy single-byte encodings # IBM866 "866", "cp866", "csibm866", "ibm866", # ISO-8859-2 "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2", # ISO-8859-3 "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3", # ISO-8859-4 "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4", # ISO-8859-5 "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988", # ISO-8859-6 "arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987", # ISO-8859-7 "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek", # ISO-8859-8 "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual", # ISO-8859-8-I "csiso88598i", "iso-8859-8-i", "logical", # ISO-8859-10 "csisolatin6", "iso-8859-10", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6", # ISO-8859-13 "iso-8859-13", "iso8859-13", "iso885913", # ISO-8859-14 "iso-8859-14", "iso8859-14", "iso885914", # ISO-8859-15 "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9", # ISO-8859-16 "iso-8859-16", # KOI8-R "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r", # KOI8-U "koi8-ru", "koi8-u", # macintosh "csmacintosh", "mac", "macintosh", "x-mac-roman", # windows-874 "dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874", # windows-1250 "cp1250", "windows-1250", "x-cp1250", # windows-1251 "cp1251", "windows-1251", "x-cp1251", 
# windows-1252 "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252", # windows-1253 "cp1253", "windows-1253", "x-cp1253", # windows-1254 "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254", # windows-1255 "cp1255", "windows-1255", "x-cp1255", # windows-1256 "cp1256", "windows-1256", "x-cp1256", # windows-1257 "cp1257", "windows-1257", "x-cp1257", # windows-1258 "cp1258", "windows-1258", "x-cp1258", # x-mac-cyrillic "x-mac-cyrillic", "x-mac-ukrainian", # Legacy multi-byte Chinese (simplified) encodings # GBK "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk", # gb18030 "gb18030", # Legacy multi-byte Chinese (traditional) encodings # Big5 "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5", # Legacy multi-byte Japanese encodings # EUC-JP "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp", # ISO-2022-JP "csiso2022jp", "iso-2022-jp", # Shift_JIS "csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis", # Legacy multi-byte Korean encodings # EUC-KR "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949", # Legacy miscellaneous encodings # replacement "csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr", "replacement", # UTF-16BE "unicodefffe", "utf-16be", # UTF-16LE "csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le", # x-user-defined "x-user-defined",

Then...

>>> names = []
>>> for encoding in whatwg:
...     try:
...         codec = codecs.lookup(encoding)
...         names.append(codec.name)
...     except:
...         pass
>>> print(sorted(set(names)))
'ascii',
'big5',
'big5hkscs',
'cp1250',
'cp1251',
'cp1252',
'cp1253',
'cp1254',
'cp1255',
'cp1256',
'cp1257',
'cp1258',
'cp866',
'cp932',
'euc_jp',
'euc_kr',
'gb18030',
'gb2312',
'gbk',
'hz',
'iso2022_jp',
'iso2022_kr',
'iso8859-1',
'iso8859-10',
'iso8859-11',
'iso8859-13',
'iso8859-14',
'iso8859-15',
'iso8859-16',
'iso8859-2',
'iso8859-3',
'iso8859-4',
'iso8859-5',
'iso8859-6',
'iso8859-7',
'iso8859-8',
'iso8859-9',
'koi8-r',
'koi8-u',
'mac-roman',
'shift_jis',
'tis-620',
'utf-16',
'utf-16-be',
'utf-16-le',
'utf-8',

lovelydinosaur avatar Oct 28 '24 17:10 lovelydinosaur