diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d723481f2..3b28721405 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway * [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones * [CRuby] The update to libxml v2.13 improves "in context" fragment parsing recovery. We removed our hacky workaround for recovery that led to silently-degraded functionality when parsing fragments with parse errors. Specifically, malformed XML fragments that used implicit namespace prefixes will now "link up" to the namespaces in the parent document or node, where previously they did not. [#2092] @flavorjones +* [CRuby] When multiple errors could be detected by the parser and there's no obvious document to save them in (for example, when parsing a document with the recovery parse option turned off), the libxml2 errors are aggregated into a single `Nokogiri::XML::SyntaxError`. Previously, only the last error recorded by libxml2 was raised, which might be misleading if it's merely a warning and not the fatal error preventing the operation. [#2562] @flavorjones ### Fixed diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index a60d5d5586..03b72fb24d 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding); int options = NUM2INT(rb_options); - xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher); c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options); @@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r int html_len = (int)RSTRING_LEN(rb_html); int options = NUM2INT(rb_options); - xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher); c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options); diff --git a/ext/nokogiri/html4_sax_push_parser.c b/ext/nokogiri/html4_sax_push_parser.c index cab3d31756..0398a2d026 100644 --- a/ext/nokogiri/html4_sax_push_parser.c +++ b/ext/nokogiri/html4_sax_push_parser.c @@ -24,16 +24,16 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk) size = (int)RSTRING_LEN(_chunk); } - Nokogiri_structured_error_func_save_and_set(&handler_state, NULL, NULL); + noko__structured_error_func_save_and_set(&handler_state, NULL, NULL); status = htmlParseChunk(ctx, chunk, size, Qtrue == _last_chunk ? 1 : 0); - Nokogiri_structured_error_func_restore(&handler_state); + noko__structured_error_func_restore(&handler_state); if ((status != 0) && !(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) { // TODO: there appear to be no tests for this block xmlErrorConstPtr e = xmlCtxtGetLastError(ctx); - Nokogiri_error_raise(NULL, e); + noko__error_raise(NULL, e); } return self; diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 95e3ab366f..25ebfcf882 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -232,13 +232,13 @@ xmlParserCtxtPtr noko_xml_sax_parser_context_unwrap(VALUE rb_context); # define NOKO_WARN_DEPRECATION(message...) rb_warning(message) #endif -void Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state); -void Nokogiri_structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data, +void noko__structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state); +void noko__structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data, xmlStructuredErrorFunc handler); -void Nokogiri_structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state); -VALUE Nokogiri_wrap_xml_syntax_error(xmlErrorConstPtr error); -void Nokogiri_error_array_pusher(void *ctx, xmlErrorConstPtr error); -NORETURN_DECL void Nokogiri_error_raise(void *ctx, xmlErrorConstPtr error); +void noko__structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state); +VALUE noko_xml_syntax_error__wrap(xmlErrorConstPtr error); +void noko__error_array_pusher(void *ctx, xmlErrorConstPtr error); +NORETURN_DECL void noko__error_raise(void *ctx, xmlErrorConstPtr error); void Nokogiri_marshal_xpath_funcall_and_return_values(xmlXPathParserContextPtr ctx, int nargs, VALUE handler, const char *function_name) ; diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index 00b9744e5c..e1022f67f5 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -364,49 +364,44 @@ version(VALUE self) * Create a new document from an IO object */ static VALUE -read_io(VALUE klass, - VALUE io, - VALUE url, - VALUE encoding, - VALUE options) -{ - const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - VALUE error_list = rb_ary_new(); - VALUE document; - xmlDocPtr doc; - - xmlResetLastError(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); - - doc = xmlReadIO( - (xmlInputReadCallback)noko_io_read, - (xmlInputCloseCallback)noko_io_close, - (void *)io, - c_url, - c_enc, - (int)NUM2INT(options) - ); - xmlSetStructuredErrorFunc(NULL, NULL); - - if (doc == NULL) { - xmlErrorConstPtr error; - - xmlFreeDoc(doc); - - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); +noko_xml_document_s_read_io(VALUE rb_class, + VALUE rb_io, + VALUE rb_url, + VALUE rb_encoding, + VALUE rb_options) +{ + libxmlStructuredErrorHandlerState handler_state; + VALUE rb_errors = rb_ary_new(); + + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); + + const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url); + const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding); + xmlDocPtr c_document = xmlReadIO( + (xmlInputReadCallback)noko_io_read, + (xmlInputCloseCallback)noko_io_close, + (void *)rb_io, + c_url, + c_enc, + (int)NUM2INT(rb_options) + ); + + noko__structured_error_func_restore(&handler_state); + + if (c_document == NULL) { + xmlFreeDoc(c_document); + + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not parse document"); } - - return Qnil; } - document = noko_xml_document_wrap(klass, doc); - rb_iv_set(document, "@errors", error_list); - return document; + VALUE rb_document = noko_xml_document_wrap(rb_class, c_document); + rb_iv_set(rb_document, "@errors", rb_errors); + return rb_document; } /* @@ -416,42 +411,34 @@ read_io(VALUE klass, * Create a new document from a String */ static VALUE -read_memory(VALUE klass, - VALUE string, - VALUE url, - VALUE encoding, - VALUE options) -{ - const char *c_buffer = StringValuePtr(string); - const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url); - const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding); - int len = (int)RSTRING_LEN(string); - VALUE error_list = rb_ary_new(); - VALUE document; - xmlDocPtr doc; - - xmlResetLastError(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); - doc = xmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options)); - xmlSetStructuredErrorFunc(NULL, NULL); +noko_xml_document_s_read_memory(VALUE rb_class, + VALUE rb_input, + VALUE rb_url, + VALUE rb_encoding, + VALUE rb_options) +{ + VALUE rb_errors = rb_ary_new(); + xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher); - if (doc == NULL) { - xmlErrorConstPtr error; + const char *c_buffer = StringValuePtr(rb_input); + const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url); + const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding); + int c_buffer_len = (int)RSTRING_LEN(rb_input); + xmlDocPtr c_document = xmlReadMemory(c_buffer, c_buffer_len, c_url, c_enc, (int)NUM2INT(rb_options)); - xmlFreeDoc(doc); + xmlSetStructuredErrorFunc(NULL, NULL); - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); + if (c_document == NULL) { + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not parse document"); } - - return Qnil; } - document = noko_xml_document_wrap(klass, doc); - rb_iv_set(document, "@errors", error_list); + VALUE document = noko_xml_document_wrap(rb_class, c_document); + rb_iv_set(document, "@errors", rb_errors); return document; } @@ -522,55 +509,58 @@ remove_namespaces_bang(VALUE self) return self; } -/* call-seq: doc.create_entity(name, type, external_id, system_id, content) +/* call-seq: + * doc.create_entity(name, type, external_id, system_id, content) * * Create a new entity named +name+. * - * +type+ is an integer representing the type of entity to be created, and it - * defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See - * the constants on Nokogiri::XML::EntityDecl for more information. + * +type+ is an integer representing the type of entity to be created, and it defaults to + * +Nokogiri::XML::EntityDecl::INTERNAL_GENERAL+. See the constants on Nokogiri::XML::EntityDecl for + * more information. * * +external_id+, +system_id+, and +content+ set the External ID, System ID, * and content respectively. All of these parameters are optional. */ static VALUE -create_entity(int argc, VALUE *argv, VALUE self) -{ - VALUE name; - VALUE type; - VALUE external_id; - VALUE system_id; - VALUE content; - xmlEntityPtr ptr; - xmlDocPtr doc ; - - doc = noko_xml_document_unwrap(self); - - rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id, - &content); - - xmlResetLastError(); - ptr = xmlAddDocEntity( - doc, - (xmlChar *)(NIL_P(name) ? NULL : StringValueCStr(name)), - (int)(NIL_P(type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)), - (xmlChar *)(NIL_P(external_id) ? NULL : StringValueCStr(external_id)), - (xmlChar *)(NIL_P(system_id) ? NULL : StringValueCStr(system_id)), - (xmlChar *)(NIL_P(content) ? NULL : StringValueCStr(content)) - ); - - if (NULL == ptr) { - xmlErrorConstPtr error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); +noko_xml_document__create_entity(int argc, VALUE *argv, VALUE rb_document) +{ + VALUE rb_name; + VALUE rb_type; + VALUE rb_ext_id; + VALUE rb_sys_id; + VALUE rb_content; + + rb_scan_args(argc, argv, "14", + &rb_name, + &rb_type, &rb_ext_id, &rb_sys_id, &rb_content); + + xmlDocPtr c_document = noko_xml_document_unwrap(rb_document); + + libxmlStructuredErrorHandlerState handler_state; + VALUE rb_errors = rb_ary_new(); + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); + + xmlEntityPtr c_entity = xmlAddDocEntity( + c_document, + (xmlChar *)(NIL_P(rb_name) ? NULL : StringValueCStr(rb_name)), + (int)(NIL_P(rb_type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(rb_type)), + (xmlChar *)(NIL_P(rb_ext_id) ? NULL : StringValueCStr(rb_ext_id)), + (xmlChar *)(NIL_P(rb_sys_id) ? NULL : StringValueCStr(rb_sys_id)), + (xmlChar *)(NIL_P(rb_content) ? NULL : StringValueCStr(rb_content)) + ); + + noko__structured_error_func_restore(&handler_state); + + if (NULL == c_entity) { + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not create entity"); } - - return Qnil; } - return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)ptr); + return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity); } static int @@ -773,8 +763,8 @@ noko_init_xml_document(void) rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc); - rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", read_memory, 4); - rb_define_singleton_method(cNokogiriXmlDocument, "read_io", read_io, 4); + rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", noko_xml_document_s_read_memory, 4); + rb_define_singleton_method(cNokogiriXmlDocument, "read_io", noko_xml_document_s_read_io, 4); rb_define_singleton_method(cNokogiriXmlDocument, "new", new, -1); rb_define_method(cNokogiriXmlDocument, "root", rb_xml_document_root, 0); @@ -784,7 +774,7 @@ noko_init_xml_document(void) rb_define_method(cNokogiriXmlDocument, "version", version, 0); rb_define_method(cNokogiriXmlDocument, "canonicalize", rb_xml_document_canonicalize, -1); rb_define_method(cNokogiriXmlDocument, "url", url, 0); - rb_define_method(cNokogiriXmlDocument, "create_entity", create_entity, -1); + rb_define_method(cNokogiriXmlDocument, "create_entity", noko_xml_document__create_entity, -1); rb_define_method(cNokogiriXmlDocument, "remove_namespaces!", remove_namespaces_bang, 0); rb_define_protected_method(cNokogiriXmlDocument, "initialize_copy_with_args", rb_xml_document_initialize_copy_with_args, diff --git a/ext/nokogiri/xml_dtd.c b/ext/nokogiri/xml_dtd.c index c8a3024822..d36102057c 100644 --- a/ext/nokogiri/xml_dtd.c +++ b/ext/nokogiri/xml_dtd.c @@ -144,7 +144,7 @@ validate(VALUE self, VALUE document) ctxt = xmlNewValidCtxt(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)error_list, noko__error_array_pusher); xmlValidateDtd(ctxt, doc, dtd); diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 5c5e4e9c19..da638f6101 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -2105,36 +2105,38 @@ compare(VALUE self, VALUE _other) /* * call-seq: - * process_xincludes(options) + * process_xincludes(flags) * * Loads and substitutes all xinclude elements below the node. The - * parser context will be initialized with +options+. + * parser context will be initialized with +flags+. */ static VALUE -process_xincludes(VALUE self, VALUE options) +noko_xml_node__process_xincludes(VALUE rb_node, VALUE rb_flags) { - int rcode ; - xmlNodePtr node; - VALUE error_list = rb_ary_new(); + int status ; + xmlNodePtr c_node; + VALUE rb_errors = rb_ary_new(); + libxmlStructuredErrorHandlerState handler_state; - Noko_Node_Get_Struct(self, xmlNode, node); + Noko_Node_Get_Struct(rb_node, xmlNode, c_node); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); - rcode = xmlXIncludeProcessTreeFlags(node, (int)NUM2INT(options)); - xmlSetStructuredErrorFunc(NULL, NULL); + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); - if (rcode < 0) { - xmlErrorConstPtr error; + status = xmlXIncludeProcessTreeFlags(c_node, (int)NUM2INT(rb_flags)); - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); + noko__structured_error_func_restore(&handler_state); + + if (status < 0) { + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not perform xinclude substitution"); } } - return self; + return rb_node; } @@ -2156,7 +2158,7 @@ in_context(VALUE self, VALUE _str, VALUE _options) node_children = node->children; doc_children = node->doc->children; - xmlSetStructuredErrorFunc((void *)err, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)err, noko__error_array_pusher); /* This function adds a fake node to the child of +node+. If the parser * does not exit cleanly with XML_ERR_OK, the list is freed. This can @@ -2401,7 +2403,7 @@ noko_init_xml_node(void) rb_define_private_method(cNokogiriXmlNode, "native_write_to", native_write_to, 4); rb_define_private_method(cNokogiriXmlNode, "prepend_newline?", rb_prepend_newline, 0); rb_define_private_method(cNokogiriXmlNode, "html_standard_serialize", html_standard_serialize, 1); - rb_define_private_method(cNokogiriXmlNode, "process_xincludes", process_xincludes, 1); + rb_define_private_method(cNokogiriXmlNode, "process_xincludes", noko_xml_node__process_xincludes, 1); rb_define_private_method(cNokogiriXmlNode, "replace_node", replace, 1); rb_define_private_method(cNokogiriXmlNode, "set", set, 2); rb_define_private_method(cNokogiriXmlNode, "set_namespace", set_namespace, 1); diff --git a/ext/nokogiri/xml_reader.c b/ext/nokogiri/xml_reader.c index b8e93ccbab..a3ddcae078 100644 --- a/ext/nokogiri/xml_reader.c +++ b/ext/nokogiri/xml_reader.c @@ -154,7 +154,7 @@ rb_xml_reader_namespaces(VALUE rb_reader) rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0); - xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher); c_node = xmlTextReaderExpand(c_reader); xmlSetStructuredErrorFunc(NULL, NULL); @@ -196,7 +196,7 @@ rb_xml_reader_attribute_hash(VALUE rb_reader) rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0); - xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher); c_node = xmlTextReaderExpand(c_reader); xmlSetStructuredErrorFunc(NULL, NULL); @@ -515,44 +515,41 @@ node_type(VALUE self) * Move the Reader forward through the XML document. */ static VALUE -read_more(VALUE self) +read_more(VALUE rb_reader) { - xmlTextReaderPtr reader; - xmlErrorConstPtr error; - VALUE error_list; - int ret; - xmlDocPtr c_document; + xmlTextReaderPtr c_reader; + libxmlStructuredErrorHandlerState handler_state; - TypedData_Get_Struct(self, xmlTextReader, &xml_text_reader_type, reader); + TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_text_reader_type, c_reader); - error_list = rb_funcall(self, rb_intern("errors"), 0); + VALUE rb_errors = rb_funcall(rb_reader, rb_intern("errors"), 0); + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); - ret = xmlTextReaderRead(reader); - xmlSetStructuredErrorFunc(NULL, NULL); + int status = xmlTextReaderRead(c_reader); + + noko__structured_error_func_restore(&handler_state); - c_document = xmlTextReaderCurrentDoc(reader); + xmlDocPtr c_document = xmlTextReaderCurrentDoc(c_reader); if (c_document && c_document->encoding == NULL) { - VALUE constructor_encoding = rb_iv_get(self, "@encoding"); + VALUE constructor_encoding = rb_iv_get(rb_reader, "@encoding"); if (RTEST(constructor_encoding)) { c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding)); } else { - rb_iv_set(self, "@encoding", NOKOGIRI_STR_NEW2("UTF-8")); + rb_iv_set(rb_reader, "@encoding", NOKOGIRI_STR_NEW2("UTF-8")); c_document->encoding = xmlStrdup(BAD_CAST "UTF-8"); } } - if (ret == 1) { return self; } - if (ret == 0) { return Qnil; } + if (status == 1) { return rb_reader; } + if (status == 0) { return Qnil; } - error = xmlGetLastError(); - if (error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); + /* if we're here, there was an error */ + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { - rb_raise(rb_eRuntimeError, "Error pulling: %d", ret); + rb_raise(rb_eRuntimeError, "Error pulling: %d", status); } - - return Qnil; } /* diff --git a/ext/nokogiri/xml_relax_ng.c b/ext/nokogiri/xml_relax_ng.c index 0161115b94..73b3132819 100644 --- a/ext/nokogiri/xml_relax_ng.c +++ b/ext/nokogiri/xml_relax_ng.c @@ -45,7 +45,7 @@ validate_document(VALUE self, VALUE document) xmlRelaxNGSetValidStructuredErrors( valid_ctxt, - Nokogiri_error_array_pusher, + noko__error_array_pusher, (void *)errors ); @@ -58,7 +58,7 @@ validate_document(VALUE self, VALUE document) static VALUE xml_relax_ng_parse_schema( - VALUE klass, + VALUE rb_class, xmlRelaxNGParserCtxtPtr c_parser_context, VALUE rb_parse_options ) @@ -66,6 +66,7 @@ xml_relax_ng_parse_schema( VALUE rb_errors; VALUE rb_schema; xmlRelaxNGPtr c_schema; + libxmlStructuredErrorHandlerState handler_state; if (NIL_P(rb_parse_options)) { rb_parse_options = rb_const_get_at( @@ -75,31 +76,30 @@ xml_relax_ng_parse_schema( } rb_errors = rb_ary_new(); - xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher); + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); xmlRelaxNGSetParserStructuredErrors( c_parser_context, - Nokogiri_error_array_pusher, + noko__error_array_pusher, (void *)rb_errors ); c_schema = xmlRelaxNGParse(c_parser_context); - xmlSetStructuredErrorFunc(NULL, NULL); xmlRelaxNGFreeParserCtxt(c_parser_context); + noko__structured_error_func_restore(&handler_state); if (NULL == c_schema) { - xmlErrorConstPtr error = xmlGetLastError(); - if (error) { - Nokogiri_error_raise(NULL, error); + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not parse document"); } - - return Qnil; } - rb_schema = TypedData_Wrap_Struct(klass, &xml_relax_ng_type, c_schema); + rb_schema = TypedData_Wrap_Struct(rb_class, &xml_relax_ng_type, c_schema); rb_iv_set(rb_schema, "@errors", rb_errors); rb_iv_set(rb_schema, "@parse_options", rb_parse_options); @@ -113,7 +113,7 @@ xml_relax_ng_parse_schema( * Create a new RelaxNG from the contents of +string+ */ static VALUE -read_memory(int argc, VALUE *argv, VALUE klass) +read_memory(int argc, VALUE *argv, VALUE rb_class) { VALUE rb_content; VALUE rb_parse_options; @@ -126,7 +126,7 @@ read_memory(int argc, VALUE *argv, VALUE klass) (int)RSTRING_LEN(rb_content) ); - return xml_relax_ng_parse_schema(klass, c_parser_context, rb_parse_options); + return xml_relax_ng_parse_schema(rb_class, c_parser_context, rb_parse_options); } /* @@ -136,7 +136,7 @@ read_memory(int argc, VALUE *argv, VALUE klass) * Create a new RelaxNG schema from the Nokogiri::XML::Document +doc+ */ static VALUE -from_document(int argc, VALUE *argv, VALUE klass) +from_document(int argc, VALUE *argv, VALUE rb_class) { VALUE rb_document; VALUE rb_parse_options; @@ -150,7 +150,7 @@ from_document(int argc, VALUE *argv, VALUE klass) c_parser_context = xmlRelaxNGNewDocParserCtxt(c_document); - return xml_relax_ng_parse_schema(klass, c_parser_context, rb_parse_options); + return xml_relax_ng_parse_schema(rb_class, c_parser_context, rb_parse_options); } void diff --git a/ext/nokogiri/xml_sax_push_parser.c b/ext/nokogiri/xml_sax_push_parser.c index dd131094e5..d9d9b28d1f 100644 --- a/ext/nokogiri/xml_sax_push_parser.c +++ b/ext/nokogiri/xml_sax_push_parser.c @@ -60,7 +60,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk) if (xmlParseChunk(ctx, chunk, size, Qtrue == _last_chunk ? 1 : 0)) { if (!(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) { xmlErrorConstPtr e = xmlCtxtGetLastError(ctx); - Nokogiri_error_raise(NULL, e); + noko__error_raise(NULL, e); } } diff --git a/ext/nokogiri/xml_schema.c b/ext/nokogiri/xml_schema.c index dfe7543d71..bc2e795cfd 100644 --- a/ext/nokogiri/xml_schema.c +++ b/ext/nokogiri/xml_schema.c @@ -45,7 +45,7 @@ validate_document(VALUE self, VALUE document) xmlSchemaSetValidStructuredErrors( valid_ctxt, - Nokogiri_error_array_pusher, + noko__error_array_pusher, (void *)errors ); @@ -84,7 +84,7 @@ validate_file(VALUE self, VALUE rb_filename) xmlSchemaSetValidStructuredErrors( valid_ctxt, - Nokogiri_error_array_pusher, + noko__error_array_pusher, (void *)errors ); @@ -97,16 +97,13 @@ validate_file(VALUE self, VALUE rb_filename) static VALUE xml_schema_parse_schema( - VALUE klass, + VALUE rb_class, xmlSchemaParserCtxtPtr c_parser_context, VALUE rb_parse_options ) { - VALUE rb_errors; - int parse_options_int; - xmlSchemaPtr c_schema; xmlExternalEntityLoader old_loader = 0; - VALUE rb_schema; + libxmlStructuredErrorHandlerState handler_state; if (NIL_P(rb_parse_options)) { rb_parse_options = rb_const_get_at( @@ -114,43 +111,41 @@ xml_schema_parse_schema( rb_intern("DEFAULT_SCHEMA") ); } + int c_parse_options = (int)NUM2INT(rb_funcall(rb_parse_options, rb_intern("to_i"), 0)); - rb_errors = rb_ary_new(); - xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher); + VALUE rb_errors = rb_ary_new(); + noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher); xmlSchemaSetParserStructuredErrors( c_parser_context, - Nokogiri_error_array_pusher, + noko__error_array_pusher, (void *)rb_errors ); - parse_options_int = (int)NUM2INT(rb_funcall(rb_parse_options, rb_intern("to_i"), 0)); - if (parse_options_int & XML_PARSE_NONET) { + if (c_parse_options & XML_PARSE_NONET) { old_loader = xmlGetExternalEntityLoader(); xmlSetExternalEntityLoader(xmlNoNetExternalEntityLoader); } - c_schema = xmlSchemaParse(c_parser_context); + xmlSchemaPtr c_schema = xmlSchemaParse(c_parser_context); if (old_loader) { xmlSetExternalEntityLoader(old_loader); } - xmlSetStructuredErrorFunc(NULL, NULL); xmlSchemaFreeParserCtxt(c_parser_context); + noko__structured_error_func_restore(&handler_state); if (NULL == c_schema) { - xmlErrorConstPtr error = xmlGetLastError(); - if (error) { - Nokogiri_error_raise(NULL, error); + VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors); + if (RB_TEST(exception)) { + rb_exc_raise(exception); } else { rb_raise(rb_eRuntimeError, "Could not parse document"); } - - return Qnil; } - rb_schema = TypedData_Wrap_Struct(klass, &xml_schema_type, c_schema); + VALUE rb_schema = TypedData_Wrap_Struct(rb_class, &xml_schema_type, c_schema); rb_iv_set(rb_schema, "@errors", rb_errors); rb_iv_set(rb_schema, "@parse_options", rb_parse_options); @@ -169,7 +164,7 @@ xml_schema_parse_schema( * [Returns] Nokogiri::XML::Schema */ static VALUE -read_memory(int argc, VALUE *argv, VALUE klass) +read_memory(int argc, VALUE *argv, VALUE rb_class) { VALUE rb_content; VALUE rb_parse_options; @@ -182,7 +177,7 @@ read_memory(int argc, VALUE *argv, VALUE klass) (int)RSTRING_LEN(rb_content) ); - return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); + return xml_schema_parse_schema(rb_class, c_parser_context, rb_parse_options); } /* @@ -197,7 +192,7 @@ read_memory(int argc, VALUE *argv, VALUE klass) * [Returns] Nokogiri::XML::Schema */ static VALUE -rb_xml_schema_s_from_document(int argc, VALUE *argv, VALUE klass) +rb_xml_schema_s_from_document(int argc, VALUE *argv, VALUE rb_class) { VALUE rb_document; VALUE rb_parse_options; @@ -230,7 +225,7 @@ rb_xml_schema_s_from_document(int argc, VALUE *argv, VALUE klass) } c_parser_context = xmlSchemaNewDocParserCtxt(c_document); - rb_schema = xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); + rb_schema = xml_schema_parse_schema(rb_class, c_parser_context, rb_parse_options); if (defensive_copy_p) { xmlFreeDoc(c_document); diff --git a/ext/nokogiri/xml_syntax_error.c b/ext/nokogiri/xml_syntax_error.c index 473ccca30e..0ccf43985d 100644 --- a/ext/nokogiri/xml_syntax_error.c +++ b/ext/nokogiri/xml_syntax_error.c @@ -3,7 +3,7 @@ VALUE cNokogiriXmlSyntaxError; void -Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state) +noko__structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state) { /* this method is tightly coupled to the implementation of xmlSetStructuredErrorFunc */ handler_state->user_data = xmlStructuredErrorContext; @@ -11,36 +11,38 @@ Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_s } void -Nokogiri_structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, - void *user_data, - xmlStructuredErrorFunc handler) +noko__structured_error_func_save_and_set( + libxmlStructuredErrorHandlerState *handler_state, + void *user_data, + xmlStructuredErrorFunc handler +) { - Nokogiri_structured_error_func_save(handler_state); + noko__structured_error_func_save(handler_state); xmlSetStructuredErrorFunc(user_data, handler); } void -Nokogiri_structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state) +noko__structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state) { xmlSetStructuredErrorFunc(handler_state->user_data, handler_state->handler); } void -Nokogiri_error_array_pusher(void *ctx, xmlErrorConstPtr error) +noko__error_array_pusher(void *ctx, xmlErrorConstPtr error) { VALUE list = (VALUE)ctx; Check_Type(list, T_ARRAY); - rb_ary_push(list, Nokogiri_wrap_xml_syntax_error(error)); + rb_ary_push(list, noko_xml_syntax_error__wrap(error)); } void -Nokogiri_error_raise(void *ctx, xmlErrorConstPtr error) +noko__error_raise(void *ctx, xmlErrorConstPtr error) { - rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); + rb_exc_raise(noko_xml_syntax_error__wrap(error)); } VALUE -Nokogiri_wrap_xml_syntax_error(xmlErrorConstPtr error) +noko_xml_syntax_error__wrap(xmlErrorConstPtr error) { VALUE msg, e, klass; diff --git a/ext/nokogiri/xml_xpath_context.c b/ext/nokogiri/xml_xpath_context.c index b6a26a0faa..e9dc675c95 100644 --- a/ext/nokogiri/xml_xpath_context.c +++ b/ext/nokogiri/xml_xpath_context.c @@ -397,7 +397,7 @@ rb_xml_xpath_context_evaluate(int argc, VALUE *argv, VALUE rb_context) ); } - xmlSetStructuredErrorFunc((void *)errors, Nokogiri_error_array_pusher); + xmlSetStructuredErrorFunc((void *)errors, noko__error_array_pusher); xmlSetGenericErrorFunc((void *)errors, generic_exception_pusher); xpath = xmlXPathEvalExpression(query, c_context); diff --git a/lib/nokogiri/xml/syntax_error.rb b/lib/nokogiri/xml/syntax_error.rb index a380407ef9..9db500e9dc 100644 --- a/lib/nokogiri/xml/syntax_error.rb +++ b/lib/nokogiri/xml/syntax_error.rb @@ -6,6 +6,19 @@ module XML # This class provides information about XML SyntaxErrors. These # exceptions are typically stored on Nokogiri::XML::Document#errors. class SyntaxError < ::Nokogiri::SyntaxError + class << self + def aggregate(errors) + return nil if errors.empty? + return errors.first if errors.length == 1 + + messages = ["Multiple errors encountered:"] + errors.each do |error| + messages << error.to_s + end + new(messages.join("\n")) + end + end + attr_reader :domain attr_reader :code attr_reader :level diff --git a/suppressions/ruby.supp b/suppressions/ruby.supp index 5d96afb3f6..2ca582f570 100644 --- a/suppressions/ruby.supp +++ b/suppressions/ruby.supp @@ -97,50 +97,23 @@ fun:xmlXPathEval fun:evaluate } -{ - https://github.com/sparklemotion/nokogiri/actions/runs/5354163940/jobs/9710862134 - # 240 (120 direct, 120 indirect) bytes in 1 blocks are definitely lost in loss record 28,980 of 37,883 - # *xmlNewNodeEatName (tree.c:2299) - # *xmlNewDocNodeEatName (tree.c:2374) - # *xmlSAX2StartElementNs (SAX2.c:2255) - # *xmlParseStartTag2 (parser.c:9658) - # *xmlParseElementStart (parser.c:10043) - # *xmlParseContentInternal (parser.c:9908) - # *xmlParseElement (parser.c:9983) - # *xmlParseDocument (parser.c:10821) - # *xmlDoRead (parser.c:15167) - # *read_memory (xml_document.c:331) - Memcheck:Leak - fun:malloc - fun:objspace_xmalloc0 - ... - fun:xmlNewNodeEatName - fun:xmlNewDocNodeEatName - fun:xmlSAX2StartElementNs - fun:xmlParseStartTag* - fun:xmlParseElementStart - fun:xmlParseContentInternal - fun:xmlParseElement - fun:xmlParseDocument - fun:xmlDoRead - fun:read_memory -} { https://github.com/sparklemotion/nokogiri/actions/runs/4845701723/jobs/8634781681 - # 240 (120 direct, 120 indirect) bytes in 1 blocks are definitely lost in loss record 28,576 of 36,831 - # malloc (vg_replace_malloc.c:381) - # objspace_xmalloc0 (gc.c:12298) - # xmlNewNodeEatName (tree.c:2257) - # xmlNewDocNodeEatName (tree.c:2329) - # xmlSAX2StartElementNs (SAX2.c:2085) - # xmlParseStartTag2 (parser.c:9459) - # xmlParseElementStart (parser.c:9848) - # xmlParseContentInternal (parser.c:9697) - # xmlParseElement (parser.c:9786) - # xmlParseDocument (parser.c:10572) - # xmlCtxtParseDocument (parser.c:13508) - # xmlReadMemory (parser.c:13646) - # *read_memory (xml_document.c:382) + # 240 (120 direct, 120 indirect) bytes in 1 blocks are definitely lost in loss record 408 of 452 + # malloc (at /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) + # objspace_xmalloc0 (gc.c:12617) + # *xmlNewElem (tree.c:2084) + # *xmlNewDocNodeEatName (tree.c:2188) + # *xmlSAX2StartElementNs (SAX2.c:2129) + # *xmlParseStartTag2.constprop.0 (parser.c:9455) + # *xmlParseElementStart (parser.c:9851) + # *xmlParseContentInternal (parser.c:9693) + # *xmlParseElement (parser.c:9789) + # *xmlParseDocument (parser.c:10573) + # *xmlCtxtParseDocument (parser.c:13687) + # *xmlReadMemory (parser.c:13822) + # *noko_xml_document_s_read_memory (xml_document.c:427) + # vm_call_cfunc_with_frame_ (vm_insnhelper.c:3490) Memcheck:Leak fun:malloc fun:objspace_xmalloc0 @@ -148,9 +121,8 @@ fun:xmlNewDocNodeEatName fun:xmlSAX2StartElementNs ... - fun:xmlParseDocument - ... - fun:read_memory + fun:xmlReadMemory + fun:noko_xml_document_s_read_memory } { TODO diff --git a/test/xml/test_syntax_error.rb b/test/xml/test_syntax_error.rb index dcca924697..40709d7190 100644 --- a/test/xml/test_syntax_error.rb +++ b/test/xml/test_syntax_error.rb @@ -2,33 +2,61 @@ require "helper" -module Nokogiri - module XML - class TestSyntaxError < Nokogiri::TestCase - it "#new accepts a message" do +describe Nokogiri::XML::SyntaxError do + it ".new accepts a message" do + error = Nokogiri::XML::SyntaxError.new("hello") + assert_equal "hello", error.message + end + + describe ".aggregate" do + describe "when there are no errors" do + it "returns nil" do + assert_nil(Nokogiri::XML::SyntaxError.aggregate([])) + end + end + + describe "when there is exactly one error" do + it "returns the error" do error = Nokogiri::XML::SyntaxError.new("hello") - assert_equal "hello", error.message + assert_equal(error, Nokogiri::XML::SyntaxError.aggregate([error])) end + end - it "describes the syntax error encountered" do - if Nokogiri.uses_libxml? - bad_doc = Nokogiri::XML("test") - error = bad_doc.errors.first - - assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message - assert_equal 1, error.line - assert_equal 1, error.column - assert_equal 3, error.level - else - bad_doc = Nokogiri::XML("test") - error = bad_doc.errors.first - - assert_equal "The element type \"root\" must be terminated by the matching end-tag \"\".", error.message - assert_nil error.line - assert_nil error.column - assert_nil error.level - end + describe "when there are multiple errors" do + it "returns a new error with the messages of all errors" do + errors = [ + Nokogiri::XML::SyntaxError.new("hello"), + Nokogiri::XML::SyntaxError.new("there"), + Nokogiri::XML::SyntaxError.new("world"), + ] + aggregate = Nokogiri::XML::SyntaxError.aggregate(errors) + assert_equal(<<~MSG.chomp, aggregate.to_s) + Multiple errors encountered: + hello + there + world + MSG end end end + + it "describes the syntax error encountered" do + if Nokogiri.uses_libxml? + bad_doc = Nokogiri::XML("test") + error = bad_doc.errors.first + + assert_equal "1:1: FATAL: Start tag expected, '<' not found", error.message + assert_equal 1, error.line + assert_equal 1, error.column + assert_equal 3, error.level + else + bad_doc = Nokogiri::XML("test") + error = bad_doc.errors.first + + assert_equal "The element type \"root\" must be terminated by the matching end-tag \"\".", error.message + assert_nil error.line + assert_nil error.column + assert_nil error.level + end + end end