Plaintext tokenizer (#351)
* Use tokenizer for plaintext content

* Fix plaintext content test after turning on tokenizer

* Use static lexer
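
In short: plaintext content is now split into terms by the lexer-based TermTokenizer instead of naive whitespace extraction, and the underlying lexer is built once as a shared static instance rather than per tokenizer. A minimal usage sketch of the tokenizer (the include path and the exact normalization rules are assumptions, not shown in this diff):

    #include <iostream>
    #include <string>

    #include "tokenizer.hpp"  // assumed include path

    int main()
    {
        // The tokenizer views the string, so `content` must outlive it.
        std::string content = "Hello, plaintext world!";
        pisa::TermTokenizer tokenizer(content);
        // Iterates lexer tokens, filtering out invalid ones and
        // transforming each into a std::string term.
        for (auto it = tokenizer.begin(); it != tokenizer.end(); ++it) {
            std::cout << *it << '\n';
        }
    }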
elshize authored Mar 28, 2020
1 parent edef570 commit a45fcc4
Showing 4 changed files with 22 additions and 12 deletions.
7 changes: 2 additions & 5 deletions include/pisa/forward_index_builder.hpp
@@ -59,11 +59,8 @@ using process_content_function_type =

 void parse_plaintext_content(std::string&& content, std::function<void(std::string&&)> process)
 {
-    std::istringstream content_stream(content);
-    std::string term;
-    while (content_stream >> term) {
-        process(std::move(term));
-    }
+    TermTokenizer tokenizer(content);
+    std::for_each(tokenizer.begin(), tokenizer.end(), process);
 }
 
 [[nodiscard]] auto is_http(std::string_view content) -> bool
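Note on the new parse_plaintext_content body: dereferencing the tokenizer's transform iterator yields a std::string by value, and that temporary binds to the std::function<void(std::string&&)> parameter, so std::for_each passes each term along without an extra copy. It is equivalent to this explicit loop (a sketch, not part of the diff):

    TermTokenizer tokenizer(content);
    for (auto it = tokenizer.begin(); it != tokenizer.end(); ++it) {
        process(*it);  // *it is a prvalue std::string; it binds to std::string&&
    }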
14 changes: 8 additions & 6 deletions include/pisa/tokenizer.hpp
@@ -31,11 +31,13 @@ struct tokens: lex::lexer<Lexer> {
 }
 };
 
+using token_type =
+    lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
+using lexer_type = lex::lexertl::actor_lexer<token_type>;
+
 class TermTokenizer {
   public:
-    using token_type =
-        lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
-    using lexer_type = lex::lexertl::actor_lexer<token_type>;
+    static tokens<lexer_type> const LEXER;
 
     explicit TermTokenizer(std::string_view text)
         : text_(text), first_(text_.begin()), last_(text_.end())
@@ -46,17 +48,18 @@ class TermTokenizer {
         first_ = text_.begin();
         last_ = text_.end();
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.begin(first_, last_)), transform);
+            boost::make_filter_iterator(is_valid, LEXER.begin(first_, last_)), transform);
     }
 
     [[nodiscard]] auto end()
     {
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.end()), transform);
+            boost::make_filter_iterator(is_valid, LEXER.end()), transform);
     }
 
   private:
     static bool is_valid(token_type const& tok) { return tok.id() != TokenType::NotValid; }
+
     static std::string transform(token_type const& tok)
     {
         auto& val = tok.value();
@@ -76,7 +79,6 @@ class TermTokenizer {
     std::string_view text_;
     std::string_view::const_iterator first_;
     std::string_view::const_iterator last_;
-    tokens<lexer_type> lexer_{};
 };
 
 } // namespace pisa
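Hoisting the lexer into static tokens<lexer_type> const LEXER means the lexertl state machine is compiled once for the whole program; previously each TermTokenizer rebuilt it through its per-instance lexer_ member. A sketch of the same pattern with hypothetical names:

    // An immutable, expensive-to-build resource shared by all instances.
    struct ExpensiveTables {
        ExpensiveTables() { /* compile token rules into state tables */ }
    };

    class Scanner {
      public:
        static ExpensiveTables const TABLES;  // declared in the header
    };

    // Defined exactly once, in one translation unit (cf. src/tokenizer.cpp below).
    ExpensiveTables const Scanner::TABLES{};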
7 changes: 7 additions & 0 deletions src/tokenizer.cpp
@@ -0,0 +1,7 @@
#include "tokenizer.hpp"

namespace pisa {

tokens<lexer_type> const TermTokenizer::LEXER = tokens<lexer_type>{};

} // namespace pisa
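This new translation unit supplies the single out-of-line definition that the non-inline static data member LEXER requires, so including tokenizer.hpp from several files does not produce duplicate definitions.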
6 changes: 5 additions & 1 deletion test/test_forward_index_builder.cpp
@@ -1,5 +1,6 @@
 #define CATCH_CONFIG_MAIN
 
+#include <algorithm>
 #include <cstdio>
 #include <string>

@@ -365,7 +366,10 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]")
         std::istringstream content_stream(record->content());
         std::string term;
         while (content_stream >> term) {
-            original_body.push_back(std::move(term));
+            TermTokenizer tok(term);
+            std::for_each(tok.begin(), tok.end(), [&original_body](auto term) {
+                original_body.push_back(std::move(term));
+            });
         }
         std::vector<std::string> produced_body;
         for (auto term_id: *seq_iter) {
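The test's expected-terms oracle still reads whitespace-separated chunks, but now pushes each chunk through the same TermTokenizer the index builder uses, so both sides apply identical tokenization. For example (assuming the lexer strips punctuation, which this diff does not show), a chunk like "world!" would be recorded as "world", matching the parser's output.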
