xml.js icon indicating copy to clipboard operation
xml.js copied to clipboard

UTF-8 and CJK error

Open nosgnoh opened this issue 6 years ago • 3 comments

Hi Kripken,

I have been using your library in my project and ran into an issue, but I don't know whether it comes from your library or from my own code, so I am logging it here:

When I validate my UTF-8-encoded XML file against an XSD schema, the validation fails if the XML file contains certain CJK characters. I have looked for a way to resolve this but have no ideas. Here are my schema and XML file:

`

<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:usdm="http://usdm.asia/usdm">

<!--
  Schema for a "usdm" document: a root <usdm> element holding an optional tree
  of <group> / <reqspec> / <reason> / <desc> elements.
  No targetNamespace attribute is present, so every component declared here is
  in no namespace. The "usdm" prefix is bound above but is not referenced by
  any declaration in this schema.
-->

<!-- Value of the "id" attribute: one or more characters, none in the range A-Z. -->
<xs:simpleType name="idType">
    <xs:restriction base="xs:string">
        <xs:pattern value="[^A-Z]+"/>
    </xs:restriction>
</xs:simpleType>

<!-- Value of the "sort" attribute: same pattern as idType. -->
<xs:simpleType name="sortType">
    <xs:restriction base="xs:string">
        <xs:pattern value="[^A-Z]+"/>
    </xs:restriction>
</xs:simpleType>

<!-- Value of the "NO" attribute: zero or more characters, none in the range a-z. -->
<xs:simpleType name="NOType">
    <xs:restriction base="xs:string">
        <xs:pattern value="[^a-z]*"/>
    </xs:restriction>
</xs:simpleType>

<!-- Text content of <richcontent>: an xs:string restriction with no facets applied. -->
<xs:simpleType name="richcontentType">
    <xs:restriction base="xs:string"/>
</xs:simpleType>

<!-- <reason>: an optional <richcontent> child plus required id, sort and NO attributes. -->
<xs:complexType name="reasonType">
    <xs:sequence>
        <xs:element name="richcontent" type="richcontentType" minOccurs="0" />
    </xs:sequence>
    <xs:attribute name="id" type="idType" use="required"/>
    <xs:attribute name="sort" type="sortType" use="required"/>
    <xs:attribute name="NO" type="NOType" use="required"/>
</xs:complexType>

<!-- <desc>: same content model and attributes as reasonType. -->
<xs:complexType name="descType">
    <xs:sequence>
        <xs:element name="richcontent" type="richcontentType" minOccurs="0" />
    </xs:sequence>
    <xs:attribute name="id" type="idType" use="required"/>
    <xs:attribute name="sort" type="sortType" use="required"/>
    <xs:attribute name="NO" type="NOType" use="required"/>
</xs:complexType>

<!--
  <reqspec>: any number of <group>, <reqspec>, <reason> or <desc> children,
  then an optional <richcontent>, then again any number of the same four
  children. The type is recursive (it references itself and groupType).
  NOTE(review): the leading and trailing choices accept the same element
  names, so the content model may be ambiguous under the Unique Particle
  Attribution rule; to be confirmed.
-->
<xs:complexType name="reqspecType">
    <xs:sequence>
        <xs:choice minOccurs="0" maxOccurs="unbounded">
            <xs:element name="group" type="groupType"/>
            <xs:element name="reqspec" type="reqspecType"/>
            <xs:element name="reason" type="reasonType" />
            <xs:element name="desc" type="descType"/>
        </xs:choice>
        <xs:sequence>
            <xs:element name="richcontent" type="richcontentType" minOccurs="0" />
            <xs:choice minOccurs="0" maxOccurs="unbounded">
                <xs:element name="group" type="groupType"/>
                <xs:element name="reqspec" type="reqspecType"/>
                <xs:element name="reason" type="reasonType" />
                <xs:element name="desc" type="descType"/>
            </xs:choice>
        </xs:sequence>
    </xs:sequence>
    <xs:attribute name="id" type="idType" use="required"/>
    <xs:attribute name="sort" type="sortType" use="required"/>
    <xs:attribute name="NO" type="NOType" use="required"/>
</xs:complexType>

<!--
  <group>: same shape as reqspecType, but the repeated children are limited
  to <group> and <reqspec>. Also recursive.
-->
<xs:complexType name="groupType">
    <xs:sequence>
        <xs:choice minOccurs="0" maxOccurs="unbounded">
            <xs:element name="group" type="groupType" />
            <xs:element name="reqspec" type="reqspecType"/>
        </xs:choice>
        <xs:sequence>
            <xs:element name="richcontent" type="richcontentType" minOccurs="0" />
            <xs:choice minOccurs="0" maxOccurs="unbounded">
                <xs:element name="group" type="groupType" />
                <xs:element name="reqspec" type="reqspecType"/>
            </xs:choice>
        </xs:sequence>
    </xs:sequence>
    <xs:attribute name="id" type="idType" use="required"/>
    <xs:attribute name="sort" type="sortType" use="required"/>
    <xs:attribute name="NO" type="NOType" use="required"/>
</xs:complexType>

<!-- Type of the root element: at most one <group> child and a required "version" string attribute. -->
<xs:complexType name="usdmType">
    <xs:sequence>
        <xs:element name="group" type="groupType" minOccurs="0" />
    </xs:sequence>
    <xs:attribute name="version" type="xs:string" use="required"/>
</xs:complexType>

<!-- The only global element declaration: <usdm>, typed by usdmType. -->
<xs:element name="usdm" type="usdmType"/>

</xs:schema> `

xml : <?xml version="1.0" encoding="utf-8"?> <usdm version="0.0.0" xmlns:usdm="http://usdm.asia/usdm"> <group id="0" sort="0" NO="ROOT.0"> <richcontent>を</richcontent> </group> </usdm>

Using this page https://www.utf8-chartable.de/unicode-utf8-table.pl?start=12288&number=512&names=- I found that every character from U+3081 | め | e3 82 81 to the end of that table fails, even though the file is UTF-8.

Thanks for your attention!

nosgnoh avatar Dec 13 '18 08:12 nosgnoh

same

kondr1 avatar Jul 17 '19 14:07 kondr1

Similar here: UTF-8 input, with or without a BOM, fails on some characters. Permitted line breaks trigger an error too. The error message claims that the submitted input is not valid UTF-8, but these characters and line breaks are allowed, so this looks like a bug.

hmuus avatar Apr 16 '20 08:04 hmuus

Similar for Polish characters. Simplified test case:

<?xml version="1.0" encoding="UTF-8"?>
<czytelnicy xsi:noNamespaceSchemaLocation="ImpCz.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <czytelnicy>Ząb</czytelnicy>
</czytelnicy>

XSD:

<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="unqualified">
    <!-- Minimal schema for the test case: a single global element "czytelnicy"
         declared with no type attribute and no inline type definition. -->
    <xs:element name="czytelnicy"></xs:element>
</xs:schema>

Error shown on demo page:

file.xml:3: parser error : PCDATA invalid Char value 5
  <czytelnicy>Ząb</czytelnicy>
               ^

Eccenux avatar May 26 '20 16:05 Eccenux