Skip to content

Commit

Permalink
feat: aggregate libxml2 errors into a single exception
Browse files Browse the repository at this point in the history
In places (like document parsing) where it's possible for libxml2 to
emit multiple warnings and errors, aggregate these libxml2 errors into
a single exception so users can see all the problems.

Previously, we were grabbing the most recent "error" which might just
be a warning and not the fatal error preventing parsing from
completing. This was misleading and hid the source of the real
problem.

Now, if there are multiple errors, they're aggregated into a single
Nokogiri::XML::SyntaxError with a message like:

    Multiple errors encountered:
    WARNING: text of warning message
    ERROR: text of error message
    WARNING: text of second warning message

Note that I've also renamed some internal C functions to try to
incrementally get consistent with naming.

Closes #2562
  • Loading branch information
flavorjones committed Jun 24, 2024
1 parent 75ab97d commit 08b59e6
Show file tree
Hide file tree
Showing 15 changed files with 261 additions and 231 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
* [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway
* [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones
* [CRuby] The update to libxml v2.13 improves "in context" fragment parsing recovery. We removed our hacky workaround for recovery that led to silently-degraded functionality when parsing fragments with parse errors. Specifically, malformed XML fragments that used implicit namespace prefixes will now "link up" to the namespaces in the parent document or node, where previously they did not. [#2092] @flavorjones
* [CRuby] When multiple errors could be detected by the parser and there's no obvious document to save them in (for example, when parsing a document with the recovery parse option turned off), the libxml2 errors are aggregated into a single `Nokogiri::XML::SyntaxError`. Previously, only the last error recorded by libxml2 was raised, which might be misleading if it's merely a warning and not the fatal error preventing the operation. [#2562] @flavorjones


### Fixed
Expand Down
4 changes: 2 additions & 2 deletions ext/nokogiri/html4_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
int options = NUM2INT(rb_options);

xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);

c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);

Expand Down Expand Up @@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
int html_len = (int)RSTRING_LEN(rb_html);
int options = NUM2INT(rb_options);

xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);

c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);

Expand Down
6 changes: 3 additions & 3 deletions ext/nokogiri/html4_sax_push_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
size = (int)RSTRING_LEN(_chunk);
}

Nokogiri_structured_error_func_save_and_set(&handler_state, NULL, NULL);
noko__structured_error_func_save_and_set(&handler_state, NULL, NULL);

status = htmlParseChunk(ctx, chunk, size, Qtrue == _last_chunk ? 1 : 0);

Nokogiri_structured_error_func_restore(&handler_state);
noko__structured_error_func_restore(&handler_state);

if ((status != 0) && !(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) {
// TODO: there appear to be no tests for this block
xmlErrorConstPtr e = xmlCtxtGetLastError(ctx);
Nokogiri_error_raise(NULL, e);
noko__error_raise(NULL, e);
}

return self;
Expand Down
12 changes: 6 additions & 6 deletions ext/nokogiri/nokogiri.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,13 @@ xmlParserCtxtPtr noko_xml_sax_parser_context_unwrap(VALUE rb_context);
# define NOKO_WARN_DEPRECATION(message...) rb_warning(message)
#endif

void Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state);
void Nokogiri_structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data,
void noko__structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state);
void noko__structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data,
xmlStructuredErrorFunc handler);
void Nokogiri_structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state);
VALUE Nokogiri_wrap_xml_syntax_error(xmlErrorConstPtr error);
void Nokogiri_error_array_pusher(void *ctx, xmlErrorConstPtr error);
NORETURN_DECL void Nokogiri_error_raise(void *ctx, xmlErrorConstPtr error);
void noko__structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state);
VALUE noko_xml_syntax_error__wrap(xmlErrorConstPtr error);
void noko__error_array_pusher(void *ctx, xmlErrorConstPtr error);
NORETURN_DECL void noko__error_raise(void *ctx, xmlErrorConstPtr error);
void Nokogiri_marshal_xpath_funcall_and_return_values(xmlXPathParserContextPtr ctx, int nargs, VALUE handler,
const char *function_name) ;

Expand Down
196 changes: 94 additions & 102 deletions ext/nokogiri/xml_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -364,49 +364,44 @@ version(VALUE self)
* Create a new document from an IO object
*/
static VALUE
read_io(VALUE klass,
VALUE io,
VALUE url,
VALUE encoding,
VALUE options)
{
const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
VALUE error_list = rb_ary_new();
VALUE document;
xmlDocPtr doc;

xmlResetLastError();
xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

doc = xmlReadIO(
(xmlInputReadCallback)noko_io_read,
(xmlInputCloseCallback)noko_io_close,
(void *)io,
c_url,
c_enc,
(int)NUM2INT(options)
);
xmlSetStructuredErrorFunc(NULL, NULL);

if (doc == NULL) {
xmlErrorConstPtr error;

xmlFreeDoc(doc);

error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
noko_xml_document_s_read_io(VALUE rb_class,
VALUE rb_io,
VALUE rb_url,
VALUE rb_encoding,
VALUE rb_options)
{
libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();

noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
xmlDocPtr c_document = xmlReadIO(
(xmlInputReadCallback)noko_io_read,
(xmlInputCloseCallback)noko_io_close,
(void *)rb_io,
c_url,
c_enc,
(int)NUM2INT(rb_options)
);

noko__structured_error_func_restore(&handler_state);

if (c_document == NULL) {
xmlFreeDoc(c_document);

VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not parse document");
}

return Qnil;
}

document = noko_xml_document_wrap(klass, doc);
rb_iv_set(document, "@errors", error_list);
return document;
VALUE rb_document = noko_xml_document_wrap(rb_class, c_document);
rb_iv_set(rb_document, "@errors", rb_errors);
return rb_document;
}

/*
Expand All @@ -416,42 +411,36 @@ read_io(VALUE klass,
* Create a new document from a String
*/
static VALUE
read_memory(VALUE klass,
VALUE string,
VALUE url,
VALUE encoding,
VALUE options)
{
const char *c_buffer = StringValuePtr(string);
const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
int len = (int)RSTRING_LEN(string);
VALUE error_list = rb_ary_new();
VALUE document;
xmlDocPtr doc;
noko_xml_document_s_read_memory(VALUE rb_class,
VALUE rb_input,
VALUE rb_url,
VALUE rb_encoding,
VALUE rb_options)
{
VALUE rb_errors = rb_ary_new();
xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher);

const char *c_buffer = StringValuePtr(rb_input);
const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
int c_buffer_len = (int)RSTRING_LEN(rb_input);
xmlDocPtr doc = xmlReadMemory(c_buffer, c_buffer_len, c_url, c_enc, (int)NUM2INT(rb_options));

xmlResetLastError();
xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
doc = xmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
xmlSetStructuredErrorFunc(NULL, NULL);

if (doc == NULL) {
xmlErrorConstPtr error;

xmlFreeDoc(doc);

error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not parse document");
}

return Qnil;
}

document = noko_xml_document_wrap(klass, doc);
rb_iv_set(document, "@errors", error_list);
VALUE document = noko_xml_document_wrap(rb_class, doc);
rb_iv_set(document, "@errors", rb_errors);
return document;
}

Expand Down Expand Up @@ -522,55 +511,58 @@ remove_namespaces_bang(VALUE self)
return self;
}

/* call-seq: doc.create_entity(name, type, external_id, system_id, content)
/* call-seq:
* doc.create_entity(name, type, external_id, system_id, content)
*
* Create a new entity named +name+.
*
* +type+ is an integer representing the type of entity to be created, and it
* defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See
* the constants on Nokogiri::XML::EntityDecl for more information.
* +type+ is an integer representing the type of entity to be created, and it defaults to
* +Nokogiri::XML::EntityDecl::INTERNAL_GENERAL+. See the constants on Nokogiri::XML::EntityDecl for
* more information.
*
* +external_id+, +system_id+, and +content+ set the External ID, System ID,
* and content respectively. All of these parameters are optional.
*/
static VALUE
create_entity(int argc, VALUE *argv, VALUE self)
{
VALUE name;
VALUE type;
VALUE external_id;
VALUE system_id;
VALUE content;
xmlEntityPtr ptr;
xmlDocPtr doc ;

doc = noko_xml_document_unwrap(self);

rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
&content);

xmlResetLastError();
ptr = xmlAddDocEntity(
doc,
(xmlChar *)(NIL_P(name) ? NULL : StringValueCStr(name)),
(int)(NIL_P(type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
(xmlChar *)(NIL_P(external_id) ? NULL : StringValueCStr(external_id)),
(xmlChar *)(NIL_P(system_id) ? NULL : StringValueCStr(system_id)),
(xmlChar *)(NIL_P(content) ? NULL : StringValueCStr(content))
);

if (NULL == ptr) {
xmlErrorConstPtr error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
noko_xml_document__create_entity(int argc, VALUE *argv, VALUE rb_document)
{
VALUE rb_name;
VALUE rb_type;
VALUE rb_ext_id;
VALUE rb_sys_id;
VALUE rb_content;

rb_scan_args(argc, argv, "14",
&rb_name,
&rb_type, &rb_ext_id, &rb_sys_id, &rb_content);

xmlDocPtr c_document = noko_xml_document_unwrap(rb_document);

libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();
noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

xmlEntityPtr c_entity = xmlAddDocEntity(
c_document,
(xmlChar *)(NIL_P(rb_name) ? NULL : StringValueCStr(rb_name)),
(int)(NIL_P(rb_type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(rb_type)),
(xmlChar *)(NIL_P(rb_ext_id) ? NULL : StringValueCStr(rb_ext_id)),
(xmlChar *)(NIL_P(rb_sys_id) ? NULL : StringValueCStr(rb_sys_id)),
(xmlChar *)(NIL_P(rb_content) ? NULL : StringValueCStr(rb_content))
);

noko__structured_error_func_restore(&handler_state);

if (NULL == c_entity) {
VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not create entity");
}

return Qnil;
}

return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)ptr);
return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
}

static int
Expand Down Expand Up @@ -773,8 +765,8 @@ noko_init_xml_document(void)

rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc);

rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", read_memory, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_io", read_io, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", noko_xml_document_s_read_memory, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_io", noko_xml_document_s_read_io, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "new", new, -1);

rb_define_method(cNokogiriXmlDocument, "root", rb_xml_document_root, 0);
Expand All @@ -784,7 +776,7 @@ noko_init_xml_document(void)
rb_define_method(cNokogiriXmlDocument, "version", version, 0);
rb_define_method(cNokogiriXmlDocument, "canonicalize", rb_xml_document_canonicalize, -1);
rb_define_method(cNokogiriXmlDocument, "url", url, 0);
rb_define_method(cNokogiriXmlDocument, "create_entity", create_entity, -1);
rb_define_method(cNokogiriXmlDocument, "create_entity", noko_xml_document__create_entity, -1);
rb_define_method(cNokogiriXmlDocument, "remove_namespaces!", remove_namespaces_bang, 0);

rb_define_protected_method(cNokogiriXmlDocument, "initialize_copy_with_args", rb_xml_document_initialize_copy_with_args,
Expand Down
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_dtd.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ validate(VALUE self, VALUE document)

ctxt = xmlNewValidCtxt();

xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)error_list, noko__error_array_pusher);

xmlValidateDtd(ctxt, doc, dtd);

Expand Down
Loading

0 comments on commit 08b59e6

Please sign in to comment.