Plaintext tokenizer (#351)
* Use tokenizer for plaintext content

* Fix plaintext content test after turning on tokenizer

* Use static lexer
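
In short: plaintext content is now split into terms by the lexer-based TermTokenizer instead of naive whitespace extraction, and the underlying lexer is built once as a shared static instance rather than per tokenizer. A minimal usage sketch of the tokenizer (the include path and the exact normalization rules are assumptions, not shown in this diff):

    #include <iostream>
    #include <string>

    #include "tokenizer.hpp"  // assumed include path

    int main()
    {
        // The tokenizer views the string, so `content` must outlive it.
        std::string content = "Hello, plaintext world!";
        pisa::TermTokenizer tokenizer(content);
        // Iterates lexer tokens, filtering out invalid ones and
        // transforming each into a std::string term.
        for (auto it = tokenizer.begin(); it != tokenizer.end(); ++it) {
            std::cout << *it << '\n';
        }
    }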
elshize authored Mar 28, 2020
1 parent edef570 commit a45fcc4
Showing 4 changed files with 22 additions and 12 deletions.
7 changes: 2 additions & 5 deletions include/pisa/forward_index_builder.hpp
@@ -59,11 +59,8 @@ using process_content_function_type =

 void parse_plaintext_content(std::string&& content, std::function<void(std::string&&)> process)
 {
-    std::istringstream content_stream(content);
-    std::string term;
-    while (content_stream >> term) {
-        process(std::move(term));
-    }
+    TermTokenizer tokenizer(content);
+    std::for_each(tokenizer.begin(), tokenizer.end(), process);
 }
 
 [[nodiscard]] auto is_http(std::string_view content) -> bool
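Note on the new parse_plaintext_content body: dereferencing the tokenizer's transform iterator yields a std::string by value, and that temporary binds to the std::function<void(std::string&&)> parameter, so std::for_each passes each term along without an extra copy. It is equivalent to this explicit loop (a sketch, not part of the diff):

    TermTokenizer tokenizer(content);
    for (auto it = tokenizer.begin(); it != tokenizer.end(); ++it) {
        process(*it);  // *it is a prvalue std::string; it binds to std::string&&
    }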
14 changes: 8 additions & 6 deletions include/pisa/tokenizer.hpp
@@ -31,11 +31,13 @@ struct tokens: lex::lexer<Lexer> {
 }
 };
 
+using token_type =
+    lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
+using lexer_type = lex::lexertl::actor_lexer<token_type>;
+
 class TermTokenizer {
   public:
-    using token_type =
-        lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
-    using lexer_type = lex::lexertl::actor_lexer<token_type>;
+    static tokens<lexer_type> const LEXER;
 
     explicit TermTokenizer(std::string_view text)
         : text_(text), first_(text_.begin()), last_(text_.end())
@@ -46,17 +48,18 @@ class TermTokenizer {
         first_ = text_.begin();
         last_ = text_.end();
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.begin(first_, last_)), transform);
+            boost::make_filter_iterator(is_valid, LEXER.begin(first_, last_)), transform);
     }
 
     [[nodiscard]] auto end()
     {
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.end()), transform);
+            boost::make_filter_iterator(is_valid, LEXER.end()), transform);
     }
 
   private:
     static bool is_valid(token_type const& tok) { return tok.id() != TokenType::NotValid; }
+
     static std::string transform(token_type const& tok)
     {
         auto& val = tok.value();
@@ -76,7 +79,6 @@ class TermTokenizer {
     std::string_view text_;
     std::string_view::const_iterator first_;
     std::string_view::const_iterator last_;
-    tokens<lexer_type> lexer_{};
 };
 
 } // namespace pisa
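Hoisting the lexer into static tokens<lexer_type> const LEXER means the lexertl state machine is compiled once for the whole program; previously each TermTokenizer rebuilt it through its per-instance lexer_ member. A sketch of the same pattern with hypothetical names:

    // An immutable, expensive-to-build resource shared by all instances.
    struct ExpensiveTables {
        ExpensiveTables() { /* compile token rules into state tables */ }
    };

    class Scanner {
      public:
        static ExpensiveTables const TABLES;  // declared in the header
    };

    // Defined exactly once, in one translation unit (cf. src/tokenizer.cpp below).
    ExpensiveTables const Scanner::TABLES{};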
7 changes: 7 additions & 0 deletions src/tokenizer.cpp
@@ -0,0 +1,7 @@
#include "tokenizer.hpp"

namespace pisa {

tokens<lexer_type> const TermTokenizer::LEXER = tokens<lexer_type>{};

} // namespace pisa
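This new translation unit supplies the single out-of-line definition that the non-inline static data member LEXER requires, so including tokenizer.hpp from several files does not produce duplicate definitions.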
6 changes: 5 additions & 1 deletion test/test_forward_index_builder.cpp
@@ -1,5 +1,6 @@
 #define CATCH_CONFIG_MAIN
 
+#include <algorithm>
 #include <cstdio>
 #include <string>

@@ -365,7 +366,10 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]")
         std::istringstream content_stream(record->content());
         std::string term;
         while (content_stream >> term) {
-            original_body.push_back(std::move(term));
+            TermTokenizer tok(term);
+            std::for_each(tok.begin(), tok.end(), [&original_body](auto term) {
+                original_body.push_back(std::move(term));
+            });
         }
         std::vector<std::string> produced_body;
         for (auto term_id: *seq_iter) {
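The test's expected-terms oracle still reads whitespace-separated chunks, but now pushes each chunk through the same TermTokenizer the index builder uses, so both sides apply identical tokenization. For example (assuming the lexer strips punctuation, which this diff does not show), a chunk like "world!" would be recorded as "world", matching the parser's output.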
