jsoup
jsoup copied to clipboard
W3CDom attribute names case sensitivity
According to the HTML specification (http://w3c.github.io/html-reference/documents.html#case-insensitivity), both tag and attribute names are case-insensitive. However, in the current implementation tag names are converted to lower case, while attribute names are left as-is.
Example HTML:
<html lang=en>
<body>
<img src="firstImage.jpg" alt="Alt one" />
<IMG SRC="secondImage.jpg" AlT="Alt two" />
</body>
</html>
will cause the following test case to fail:
/**
 * Checks that attribute lookup on a W3C DOM converted from jsoup is case-insensitive.
 *
 * <p>The fixture contains two {@code img} elements: one written entirely in lower case and one
 * with an upper-case tag name and mixed-case attribute names ({@code SRC}, {@code AlT}). After
 * conversion through {@code W3CDom.fromJsoup}, both elements are expected to be found under the
 * lower-case tag name {@code img}, and each is expected to carry exactly two attributes that can
 * be read back through the lower-case names {@code src} and {@code alt}.
 *
 * @throws IOException if the HTML fixture file cannot be read
 */
public void checkElementsAttributesCaseSensitivity() throws IOException {
    File in = ParseTest.getFile("/htmltests/attributes-case-sensitivity-test.html");
    org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(in, "UTF-8");

    org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom();
    Document doc = jDom.fromJsoup(jsoupDoc);

    final org.w3c.dom.Element body =
            (org.w3c.dom.Element) doc.getDocumentElement().getElementsByTagName("body").item(0);
    final NodeList imgs = body.getElementsByTagName("img");
    assertEquals(2, imgs.getLength());

    // First image: already lower case in the source HTML.
    final org.w3c.dom.Element first = (org.w3c.dom.Element) imgs.item(0);
    // Expected value goes first so that a failure message reports the values the right way round.
    assertEquals(2, first.getAttributes().getLength());
    final String img1 = first.getAttribute("src");
    assertEquals("firstImage.jpg", img1);
    final String alt1 = first.getAttribute("alt");
    assertEquals("Alt one", alt1);

    // Second image: written as <IMG SRC=... AlT=...>; lookups still use lower-case names.
    final org.w3c.dom.Element second = (org.w3c.dom.Element) imgs.item(1);
    assertEquals(2, second.getAttributes().getLength());
    final String img2 = second.getAttribute("src");
    assertEquals("secondImage.jpg", img2);
    final String alt2 = second.getAttribute("alt");
    assertEquals("Alt two", alt2);
}
The change that fixes this is quite simple:
index 81ac932..281e3d7 100644
--- a/src/main/java/org/jsoup/helper/W3CDom.java
+++ b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -124,7 +124,7 @@ public class W3CDom {
// valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
- el.setAttribute(key, attribute.getValue());
+ el.setAttribute(key.toLowerCase(), attribute.getValue());
}
}