webstruct icon indicating copy to clipboard operation
webstruct copied to clipboard

to_webannotator may fail if an attribute value of some HTML element contains a control character

Status: Open — kmike opened this issue 10 years ago • 0 comments

Traceback (raised when calling NER.annotate() on the page https://github.com/scrapinghub/webstruct/blob/master/webstruct_data/corpus/business_pages/source/301.html):

ValueError                                Traceback (most recent call last)
<ipython-input-8-45ad24ffcda1> in <module>()
      9     try:
     10         with open(fn, 'rb') as f:
---> 11             annotated = ner.annotate(f.read())
     12 
     13         path, filename = os.path.split(fn)

/Users/kmike/svn/webstruct/webstruct/model.pyc in annotate(self, bytes_data, pretty_print)
    105         html_tokens, tags = self.extract_raw(bytes_data)
    106         tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
--> 107         tree = to_webannotator(tree, self.entity_colors)
    108         return tostring(tree, pretty_print=pretty_print)
    109 

/Users/kmike/svn/webstruct/webstruct/webannotator.py in to_webannotator(tree, entity_colors)
    258     """
    259     handler = _WaContentHandler(entity_colors)
--> 260     lxml.sax.saxify(tree, handler)
    261     tree = handler.out.etree
    262     _copy_title(tree)

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(element_or_tree, content_handler)
    245     them against a SAX ContentHandler.
    246     """
--> 247     return ElementTreeProducer(element_or_tree, content_handler).saxify()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(self)
    178                 self._recursive_saxify(sibling, {})
    179 
--> 180         self._recursive_saxify(element, {})
    181 
    182         if hasattr(element, 'getnext'):

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    220             content_handler.startPrefixMapping(prefix, uri)
    221         content_handler.startElementNS((ns_uri, local_name),
--> 222                                        qname, sax_attributes)
    223         if element.text:
    224             content_handler.characters(element.text)

/Users/kmike/svn/webstruct/webstruct/webannotator.py in startElementNS(self, name, qname, attributes)
    122         self._closeSpan()
    123         # print('start %s' % qname)
--> 124         self.out.startElementNS(name, qname, attributes)
    125         self._openSpan()
    126 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in startElementNS(self, ns_name, qname, attributes)
    110         else:
    111             element = SubElement(element_stack[-1], el_name,
--> 112                                  attrs, self._new_mappings)
    113         element_stack.append(element)
    114 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree.SubElement (src/lxml/lxml.etree.c:67070)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15492)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15423)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._initNodeAttributes (src/lxml/lxml.etree.c:16529)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._addAttributeToNode (src/lxml/lxml.etree.c:16701)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._utf8 (src/lxml/lxml.etree.c:26485)()

ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters

kmike avatar May 20 '14 11:05 kmike