From d880936204b605da7dcb7c1711f42b9f9e8c5030 Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 18:51:23 -0400 Subject: [PATCH 1/6] enforce a higher selma --- html-pipeline.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html-pipeline.gemspec b/html-pipeline.gemspec index c901c7a9..f8d56e63 100644 --- a/html-pipeline.gemspec +++ b/html-pipeline.gemspec @@ -25,7 +25,7 @@ Gem::Specification.new do |gem| "rubygems_mfa_required" => "true", } - gem.add_dependency("selma", "~> 0.1") + gem.add_dependency("selma", "~> 0.4") gem.add_dependency("zeitwerk", "~> 2.5") gem.post_install_message = <<~MSG From b0741631693122c76436d554664b4813b289cebe Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 18:51:30 -0400 Subject: [PATCH 2/6] explain this better --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f5e5bb1..5b26e0a7 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ For more examples of customizing the sanitization process to include the tags yo `NodeFilters`s can operate either on HTML elements or text nodes using CSS selectors. Each `NodeFilter` must define a method named `selector` which provides an instance of `Selma::Selector`. If elements are being manipulated, `handle_element` must be defined, taking one argument, `element`; if text nodes are being manipulated, `handle_text_chunk` must be defined, taking one argument, `text_chunk`. `@config`, and `@result` are available to use, and any changes made to these ivars are passed on to the next filter. -`NodeFilter` also has an optional method, `after_initialize`, which is run after the filter initializes. This can be useful in setting up a custom state for `result` to take advantage of. +`NodeFilter` also has an optional method, `after_initialize`, which is run after the filter initializes. This can be useful in setting up a fresh custom state for `result` to start from each time the pipeline is called. Here's an example `NodeFilter` that adds a base url to images that are root relative: From 933200887a7abe42326041233a7c8e5ec8ff8a64 Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 18:55:41 -0400 Subject: [PATCH 3/6] highlight sanitization in test --- test/sanitization_filter_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sanitization_filter_test.rb b/test/sanitization_filter_test.rb index fd81c04d..aebdae5d 100644 --- a/test/sanitization_filter_test.rb +++ b/test/sanitization_filter_test.rb @@ -232,7 +232,7 @@ def test_sanitization_pipeline_can_be_configured CODE expected = <<~HTML -

This is great, @balevine:

+

This is great, @balevine:

some_code(:first)
         
HTML From b4fc1011d94a732eb8f5eaaf2fc83ce26662e89d Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 21:15:20 -0400 Subject: [PATCH 4/6] stop running sanitization twice --- README.md | 4 ++-- lib/html_pipeline.rb | 19 ++++++++++++------- test/html_pipeline_test.rb | 10 +++++----- test/sanitization_filter_test.rb | 2 +- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 5b26e0a7..0226a4d7 100644 --- a/README.md +++ b/README.md @@ -171,9 +171,9 @@ The `ConvertFilter` takes text and turns it into HTML. `@text`, `@config`, and ` ### Sanitization -Because the web can be a scary place, HTML is automatically sanitized after the `ConvertFilter` runs and before the `NodeFilter`s are processed. This is to prevent malicious or unexpected input from entering the pipeline. +Because the web can be a scary place, **HTML is automatically sanitized** after the `ConvertFilter` runs and before the `NodeFilter`s are processed. This is to prevent malicious or unexpected input from entering the pipeline. -The sanitization process takes a hash configuration of settings. See the [Selma](https://www.github.com/gjtorikian/selma) documentation for more information on how to configure these settings. +The sanitization process takes a hash configuration of settings. See the [Selma](https://www.github.com/gjtorikian/selma) documentation for more information on how to configure these settings. Note that users must correctly configure the sanitization configuration if they expect to use it correctly in conjunction with handlers which manipulate HTML. A default sanitization config is provided by this library (`HTMLPipeline::SanitizationFilter::DEFAULT_CONFIG`). A sample custom sanitization allowlist might look like this: diff --git a/lib/html_pipeline.rb b/lib/html_pipeline.rb index 43975e7e..523ec63c 100644 --- a/lib/html_pipeline.rb +++ b/lib/html_pipeline.rb @@ -175,11 +175,20 @@ def call(text, context: {}, result: {}) end end - unless @node_filters.empty? + rewriter_options = { + memory: { + max_allowed_memory_usage: 5242880, # arbitrary limit of 5MB + }, + } + + if @node_filters.empty? + instrument("sanitization.html_pipeline", payload) do + result[:output] = Selma::Rewriter.new(sanitizer: @sanitization_config, handlers: @node_filters, options: rewriter_options).rewrite(html) + end unless @convert_filter.nil? # no html, so no sanitization + else instrument("call_node_filters.html_pipeline", payload) do @node_filters.each { |filter| filter.context = (filter.context || {}).merge(context) } - result[:output] = Selma::Rewriter.new(sanitizer: @sanitization_config, handlers: @node_filters).rewrite(html) - html = result[:output] + result[:output] = Selma::Rewriter.new(sanitizer: @sanitization_config, handlers: @node_filters, options: rewriter_options).rewrite(html) payload = default_payload({ node_filters: @node_filters.map { |f| f.class.name }, context: context, @@ -188,10 +197,6 @@ def call(text, context: {}, result: {}) end end - instrument("sanitization.html_pipeline", payload) do - result[:output] = Selma::Rewriter.new(sanitizer: @sanitization_config, handlers: @node_filters).rewrite(html) - end - result = result.merge(@node_filters.collect(&:result).reduce({}, :merge)) @node_filters.each(&:reset!) diff --git a/test/html_pipeline_test.rb b/test/html_pipeline_test.rb index 1ef9ee9c..4a797d27 100644 --- a/test/html_pipeline_test.rb +++ b/test/html_pipeline_test.rb @@ -131,7 +131,7 @@ def test_context_is_carried_over_in_call # - yeH is bolded # - strikethroughs are rendered # - mentions are not linked - assert_equal("

yeH! I think @gjtorikian is great!

", result) + assert_equal("

yeH! I think @gjtorikian is great!

", result) context = { bolded: false, @@ -144,7 +144,7 @@ def test_context_is_carried_over_in_call # - yeH is not bolded # - strikethroughs are not rendered # - mentions are linked - assert_equal("

yeH! I think @gjtorikian is ~great~!

", result_with_context) + assert_equal("

yeH! I think @gjtorikian is ~great~!

", result_with_context) end def test_text_filter_instance_context_is_carried_over_in_call @@ -160,7 +160,7 @@ def test_text_filter_instance_context_is_carried_over_in_call # note: # - yeH is not bolded due to previous context - assert_equal("

yeH! I think @gjtorikian is great!

", result) + assert_equal("

yeH! I think @gjtorikian is great!

", result) end def test_convert_filter_instance_context_is_carried_over_in_call @@ -176,7 +176,7 @@ def test_convert_filter_instance_context_is_carried_over_in_call # note: # - strikethroughs are not rendered due to previous context - assert_equal("

yeH! I think @gjtorikian is great!

", result) + assert_equal("

yeH! I think @gjtorikian is great!

", result) end def test_node_filter_instance_context_is_carried_over_in_call @@ -192,6 +192,6 @@ def test_node_filter_instance_context_is_carried_over_in_call # note: # - mentions are linked - assert_equal("

yeH! I think @gjtorikian is great!

", result) + assert_equal("

yeH! I think @gjtorikian is great!

", result) end end diff --git a/test/sanitization_filter_test.rb b/test/sanitization_filter_test.rb index aebdae5d..fd81c04d 100644 --- a/test/sanitization_filter_test.rb +++ b/test/sanitization_filter_test.rb @@ -232,7 +232,7 @@ def test_sanitization_pipeline_can_be_configured CODE expected = <<~HTML -

This is great, @balevine:

+

This is great, @balevine:

some_code(:first)
         
HTML From f52c4319e6e0041dee4821dfe21dcff6f2a9a615 Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 21:15:28 -0400 Subject: [PATCH 5/6] :gem: 3.2.1 --- lib/html_pipeline/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/html_pipeline/version.rb b/lib/html_pipeline/version.rb index 16fd6154..7d8e20a4 100644 --- a/lib/html_pipeline/version.rb +++ b/lib/html_pipeline/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true class HTMLPipeline - VERSION = "3.2.0" + VERSION = "3.2.1" end From a6d7c0a802d9656512781a386c369a7204de9027 Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Mon, 15 Jul 2024 21:18:30 -0400 Subject: [PATCH 6/6] lint --- lib/html_pipeline/convert_filter.rb | 2 +- lib/html_pipeline/convert_filter/markdown_filter.rb | 2 +- lib/html_pipeline/node_filter/syntax_highlight_filter.rb | 2 +- lib/html_pipeline/text_filter.rb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/html_pipeline/convert_filter.rb b/lib/html_pipeline/convert_filter.rb index a8c08ca7..f85fc28f 100644 --- a/lib/html_pipeline/convert_filter.rb +++ b/lib/html_pipeline/convert_filter.rb @@ -5,7 +5,7 @@ class ConvertFilter < Filter attr_reader :text, :html def initialize(context: {}, result: {}) - super(context: context, result: result) + super end class << self diff --git a/lib/html_pipeline/convert_filter/markdown_filter.rb b/lib/html_pipeline/convert_filter/markdown_filter.rb index 913bfb2f..490d1464 100644 --- a/lib/html_pipeline/convert_filter/markdown_filter.rb +++ b/lib/html_pipeline/convert_filter/markdown_filter.rb @@ -12,7 +12,7 @@ class ConvertFilter < Filter # :markdown[:extensions] => Commonmarker extensions options class MarkdownFilter < ConvertFilter def initialize(context: {}, result: {}) - super(context: context, result: result) + super end # Convert Commonmark to HTML using the best available implementation. diff --git a/lib/html_pipeline/node_filter/syntax_highlight_filter.rb b/lib/html_pipeline/node_filter/syntax_highlight_filter.rb index 2fcda21b..de36d272 100644 --- a/lib/html_pipeline/node_filter/syntax_highlight_filter.rb +++ b/lib/html_pipeline/node_filter/syntax_highlight_filter.rb @@ -15,7 +15,7 @@ class NodeFilter # This filter does not write any additional information to the context hash. class SyntaxHighlightFilter < NodeFilter def initialize(context: {}, result: {}) - super(context: context, result: result) + super # TODO: test the optionality of this @formatter = context[:formatter] || Rouge::Formatters::HTML.new end diff --git a/lib/html_pipeline/text_filter.rb b/lib/html_pipeline/text_filter.rb index 17fd62d4..d43acbde 100644 --- a/lib/html_pipeline/text_filter.rb +++ b/lib/html_pipeline/text_filter.rb @@ -5,7 +5,7 @@ class TextFilter < Filter attr_reader :text def initialize(context: {}, result: {}) - super(context: context, result: result) + super end class << self