Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: aggregate libxml2 errors into a single exception #3257

Merged
merged 1 commit into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
* [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway
* [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones
* [CRuby] The update to libxml v2.13 improves "in context" fragment parsing recovery. We removed our hacky workaround for recovery that led to silently-degraded functionality when parsing fragments with parse errors. Specifically, malformed XML fragments that used implicit namespace prefixes will now "link up" to the namespaces in the parent document or node, where previously they did not. [#2092] @flavorjones
* [CRuby] When multiple errors could be detected by the parser and there's no obvious document to save them in (for example, when parsing a document with the recovery parse option turned off), the libxml2 errors are aggregated into a single `Nokogiri::XML::SyntaxError`. Previously, only the last error recorded by libxml2 was raised, which might be misleading if it's merely a warning and not the fatal error preventing the operation. [#2562] @flavorjones


### Fixed
Expand Down
4 changes: 2 additions & 2 deletions ext/nokogiri/html4_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
int options = NUM2INT(rb_options);

xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);

c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);

Expand Down Expand Up @@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
int html_len = (int)RSTRING_LEN(rb_html);
int options = NUM2INT(rb_options);

xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);

c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);

Expand Down
6 changes: 3 additions & 3 deletions ext/nokogiri/html4_sax_push_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
size = (int)RSTRING_LEN(_chunk);
}

Nokogiri_structured_error_func_save_and_set(&handler_state, NULL, NULL);
noko__structured_error_func_save_and_set(&handler_state, NULL, NULL);

status = htmlParseChunk(ctx, chunk, size, Qtrue == _last_chunk ? 1 : 0);

Nokogiri_structured_error_func_restore(&handler_state);
noko__structured_error_func_restore(&handler_state);

if ((status != 0) && !(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) {
// TODO: there appear to be no tests for this block
xmlErrorConstPtr e = xmlCtxtGetLastError(ctx);
Nokogiri_error_raise(NULL, e);
noko__error_raise(NULL, e);
}

return self;
Expand Down
12 changes: 6 additions & 6 deletions ext/nokogiri/nokogiri.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,13 @@ xmlParserCtxtPtr noko_xml_sax_parser_context_unwrap(VALUE rb_context);
# define NOKO_WARN_DEPRECATION(message...) rb_warning(message)
#endif

void Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state);
void Nokogiri_structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data,
void noko__structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state);
void noko__structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data,
xmlStructuredErrorFunc handler);
void Nokogiri_structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state);
VALUE Nokogiri_wrap_xml_syntax_error(xmlErrorConstPtr error);
void Nokogiri_error_array_pusher(void *ctx, xmlErrorConstPtr error);
NORETURN_DECL void Nokogiri_error_raise(void *ctx, xmlErrorConstPtr error);
void noko__structured_error_func_restore(libxmlStructuredErrorHandlerState *handler_state);
VALUE noko_xml_syntax_error__wrap(xmlErrorConstPtr error);
void noko__error_array_pusher(void *ctx, xmlErrorConstPtr error);
NORETURN_DECL void noko__error_raise(void *ctx, xmlErrorConstPtr error);
void Nokogiri_marshal_xpath_funcall_and_return_values(xmlXPathParserContextPtr ctx, int nargs, VALUE handler,
const char *function_name) ;

Expand Down
200 changes: 95 additions & 105 deletions ext/nokogiri/xml_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -364,49 +364,44 @@ version(VALUE self)
* Create a new document from an IO object
*/
static VALUE
read_io(VALUE klass,
VALUE io,
VALUE url,
VALUE encoding,
VALUE options)
{
const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
VALUE error_list = rb_ary_new();
VALUE document;
xmlDocPtr doc;

xmlResetLastError();
xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

doc = xmlReadIO(
(xmlInputReadCallback)noko_io_read,
(xmlInputCloseCallback)noko_io_close,
(void *)io,
c_url,
c_enc,
(int)NUM2INT(options)
);
xmlSetStructuredErrorFunc(NULL, NULL);

if (doc == NULL) {
xmlErrorConstPtr error;

xmlFreeDoc(doc);

error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
noko_xml_document_s_read_io(VALUE rb_class,
VALUE rb_io,
VALUE rb_url,
VALUE rb_encoding,
VALUE rb_options)
{
libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();

noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
xmlDocPtr c_document = xmlReadIO(
(xmlInputReadCallback)noko_io_read,
(xmlInputCloseCallback)noko_io_close,
(void *)rb_io,
c_url,
c_enc,
(int)NUM2INT(rb_options)
);

noko__structured_error_func_restore(&handler_state);

if (c_document == NULL) {
xmlFreeDoc(c_document);

VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not parse document");
}

return Qnil;
}

document = noko_xml_document_wrap(klass, doc);
rb_iv_set(document, "@errors", error_list);
return document;
VALUE rb_document = noko_xml_document_wrap(rb_class, c_document);
rb_iv_set(rb_document, "@errors", rb_errors);
return rb_document;
}

/*
Expand All @@ -416,42 +411,34 @@ read_io(VALUE klass,
* Create a new document from a String
*/
static VALUE
read_memory(VALUE klass,
VALUE string,
VALUE url,
VALUE encoding,
VALUE options)
{
const char *c_buffer = StringValuePtr(string);
const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
int len = (int)RSTRING_LEN(string);
VALUE error_list = rb_ary_new();
VALUE document;
xmlDocPtr doc;

xmlResetLastError();
xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
doc = xmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
xmlSetStructuredErrorFunc(NULL, NULL);
noko_xml_document_s_read_memory(VALUE rb_class,
VALUE rb_input,
VALUE rb_url,
VALUE rb_encoding,
VALUE rb_options)
{
VALUE rb_errors = rb_ary_new();
xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher);

if (doc == NULL) {
xmlErrorConstPtr error;
const char *c_buffer = StringValuePtr(rb_input);
const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
int c_buffer_len = (int)RSTRING_LEN(rb_input);
xmlDocPtr c_document = xmlReadMemory(c_buffer, c_buffer_len, c_url, c_enc, (int)NUM2INT(rb_options));

xmlFreeDoc(doc);
xmlSetStructuredErrorFunc(NULL, NULL);

error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
if (c_document == NULL) {
VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not parse document");
}

return Qnil;
}

document = noko_xml_document_wrap(klass, doc);
rb_iv_set(document, "@errors", error_list);
VALUE document = noko_xml_document_wrap(rb_class, c_document);
rb_iv_set(document, "@errors", rb_errors);
return document;
}

Expand Down Expand Up @@ -522,55 +509,58 @@ remove_namespaces_bang(VALUE self)
return self;
}

/* call-seq: doc.create_entity(name, type, external_id, system_id, content)
/* call-seq:
* doc.create_entity(name, type, external_id, system_id, content)
*
* Create a new entity named +name+.
*
* +type+ is an integer representing the type of entity to be created, and it
* defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See
* the constants on Nokogiri::XML::EntityDecl for more information.
* +type+ is an integer representing the type of entity to be created, and it defaults to
* +Nokogiri::XML::EntityDecl::INTERNAL_GENERAL+. See the constants on Nokogiri::XML::EntityDecl for
* more information.
*
* +external_id+, +system_id+, and +content+ set the External ID, System ID,
* and content respectively. All of these parameters are optional.
*/
static VALUE
create_entity(int argc, VALUE *argv, VALUE self)
{
VALUE name;
VALUE type;
VALUE external_id;
VALUE system_id;
VALUE content;
xmlEntityPtr ptr;
xmlDocPtr doc ;

doc = noko_xml_document_unwrap(self);

rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
&content);

xmlResetLastError();
ptr = xmlAddDocEntity(
doc,
(xmlChar *)(NIL_P(name) ? NULL : StringValueCStr(name)),
(int)(NIL_P(type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
(xmlChar *)(NIL_P(external_id) ? NULL : StringValueCStr(external_id)),
(xmlChar *)(NIL_P(system_id) ? NULL : StringValueCStr(system_id)),
(xmlChar *)(NIL_P(content) ? NULL : StringValueCStr(content))
);

if (NULL == ptr) {
xmlErrorConstPtr error = xmlGetLastError();
if (error) {
rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
noko_xml_document__create_entity(int argc, VALUE *argv, VALUE rb_document)
{
VALUE rb_name;
VALUE rb_type;
VALUE rb_ext_id;
VALUE rb_sys_id;
VALUE rb_content;

rb_scan_args(argc, argv, "14",
&rb_name,
&rb_type, &rb_ext_id, &rb_sys_id, &rb_content);

xmlDocPtr c_document = noko_xml_document_unwrap(rb_document);

libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();
noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

xmlEntityPtr c_entity = xmlAddDocEntity(
c_document,
(xmlChar *)(NIL_P(rb_name) ? NULL : StringValueCStr(rb_name)),
(int)(NIL_P(rb_type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(rb_type)),
(xmlChar *)(NIL_P(rb_ext_id) ? NULL : StringValueCStr(rb_ext_id)),
(xmlChar *)(NIL_P(rb_sys_id) ? NULL : StringValueCStr(rb_sys_id)),
(xmlChar *)(NIL_P(rb_content) ? NULL : StringValueCStr(rb_content))
);

noko__structured_error_func_restore(&handler_state);

if (NULL == c_entity) {
VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
if (RB_TEST(exception)) {
rb_exc_raise(exception);
} else {
rb_raise(rb_eRuntimeError, "Could not create entity");
}

return Qnil;
}

return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)ptr);
return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
}

static int
Expand Down Expand Up @@ -773,8 +763,8 @@ noko_init_xml_document(void)

rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc);

rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", read_memory, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_io", read_io, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_memory", noko_xml_document_s_read_memory, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "read_io", noko_xml_document_s_read_io, 4);
rb_define_singleton_method(cNokogiriXmlDocument, "new", new, -1);

rb_define_method(cNokogiriXmlDocument, "root", rb_xml_document_root, 0);
Expand All @@ -784,7 +774,7 @@ noko_init_xml_document(void)
rb_define_method(cNokogiriXmlDocument, "version", version, 0);
rb_define_method(cNokogiriXmlDocument, "canonicalize", rb_xml_document_canonicalize, -1);
rb_define_method(cNokogiriXmlDocument, "url", url, 0);
rb_define_method(cNokogiriXmlDocument, "create_entity", create_entity, -1);
rb_define_method(cNokogiriXmlDocument, "create_entity", noko_xml_document__create_entity, -1);
rb_define_method(cNokogiriXmlDocument, "remove_namespaces!", remove_namespaces_bang, 0);

rb_define_protected_method(cNokogiriXmlDocument, "initialize_copy_with_args", rb_xml_document_initialize_copy_with_args,
Expand Down
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_dtd.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ validate(VALUE self, VALUE document)

ctxt = xmlNewValidCtxt();

xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
xmlSetStructuredErrorFunc((void *)error_list, noko__error_array_pusher);

xmlValidateDtd(ctxt, doc, dtd);

Expand Down
Loading