Skip to content

Commit

Permalink
Enable ordinal compression for source field
Browse files Browse the repository at this point in the history
  • Loading branch information
mgodwan committed Aug 10, 2023
1 parent 1164221 commit dc6f68c
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.Nullable;
Expand All @@ -54,11 +55,9 @@
import org.opensearch.search.lookup.SearchLookup;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
* Internal field mapper for storing source (and recovery source)
Expand Down Expand Up @@ -199,15 +198,59 @@ public boolean isComplete() {
return complete;
}


/**
 * Trie of quoted JSON field names mapped to one-byte ordinals. {@code preParse}
 * uses it to replace a matched field name in the source bytes with its ordinal.
 */
public static Trie ordinals = new Trie();

static {
    // The position of each name in this table is the ordinal registered for it,
    // so the order of the entries must not change.
    final String[] quotedFieldNames = {
        "\"total_amount\"",
        "\"improvement_surcharge\"",
        "\"pickup_location\"",
        "\"pickup_datetime\"",
        "\"trip_type\"",
        "\"dropoff_datetime\"",
        "\"rate_code_id\"",
        "\"tolls_amount\"",
        "\"dropoff_location\"",
        "\"passenger_count\"",
        "\"fare_amount\"",
        "\"extra\"",
        "\"trip_distance\"",
        "\"tip_amount\"",
        "\"store_and_fwd_flag\"",
        "\"payment_type\"",
        "\"mta_tax\"",
        "\"vendor_id\"",
    };
    for (int ordinal = 0; ordinal < quotedFieldNames.length; ordinal++) {
        ordinals.put(quotedFieldNames[ordinal], (byte) ordinal);
    }
}


@Override
public void preParse(ParseContext context) throws IOException {
Trie.Matcher matcher = new Trie.Matcher(ordinals);
BytesReference originalSource = context.sourceToParse().source();
XContentType contentType = context.sourceToParse().getXContentType();
final BytesReference adaptedSource = applyFilters(originalSource, contentType);

if (adaptedSource != null) {
final BytesRef ref = adaptedSource.toBytesRef();
context.doc().add(new StoredField(fieldType().name(), ref.bytes, ref.offset, ref.length));
int index = ref.offset;
byte[] arr = new byte[ref.length];
int len = 0;
for (int i = ref.offset; i < ref.offset + ref.length; i ++) {
byte b = ref.bytes[i];
arr[len ++] = b;
Byte flick = matcher.match(b);
if (flick == null) {
matcher.reset();
} else if (flick > -1) {
len -= matcher.currentLength();
arr[len++] = (byte)'"';
arr[len++] = flick;
arr[len++] = (byte)'"';
} else if (b == '"' && !matcher.isFirstCharacter()){
matcher.reset();
}
}
context.doc().add(new StoredField(fieldType().name(), arr, 0, len));
}

if (originalSource != null && adaptedSource != originalSource) {
Expand All @@ -218,6 +261,12 @@ public void preParse(ParseContext context) throws IOException {
}
}

/**
 * Post-parse hook for this mapper. Currently a no-op: the body is empty, so
 * nothing is added to or changed on the given context here.
 *
 * @param context the parse context of the document being indexed (unused)
 * @throws IOException declared by the signature; never thrown by this empty body
 */
@Override
public void postParse(ParseContext context) throws IOException {
// No post-parse work is performed.
}


@Nullable
public BytesReference applyFilters(@Nullable BytesReference originalSource, @Nullable XContentType contentType) throws IOException {
if (enabled && originalSource != null) {
Expand Down
81 changes: 81 additions & 0 deletions server/src/main/java/org/opensearch/index/mapper/Trie.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.mapper;

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicReference;

/**
 * A byte-keyed trie that maps fixed strings to small non-negative byte values.
 *
 * <p>Keys are stored as their UTF-8 bytes, one trie level per byte, so that a
 * {@link Matcher} can be fed raw bytes one at a time. The value {@code -1} is
 * reserved to mean "no key ends at this node"; values registered through
 * {@link #put(String, byte)} should therefore be {@code >= 0}.
 */
public class Trie {

    /** One child slot for every possible unsigned byte value (0..255). */
    private static final int FANOUT = 256;

    private final Node root = new Node();

    public Trie() {
    }

    /**
     * Registers {@code key} with the given value, overwriting any value
     * previously stored for the same key.
     *
     * @param key the string to register; it is matched by its UTF-8 bytes
     * @param val the value returned by {@link Matcher#match(int)} once the
     *            whole key has been consumed; should be {@code >= 0} because
     *            {@code -1} marks nodes where no key ends
     */
    public void put(String key, byte val) {
        // Encode explicitly: truncating each char to a byte would corrupt
        // non-ASCII keys and make them unmatchable against UTF-8 input.
        final byte[] bytes = key.getBytes(StandardCharsets.UTF_8);
        Node current = root;
        for (byte b : bytes) {
            final int slot = b & 0xFF;
            if (current.children[slot] == null) {
                current.children[slot] = new Node();
            }
            current = current.children[slot];
        }
        current.val = val;
        // Length in bytes, since callers rewind a byte buffer by this amount.
        current.len = bytes.length;
    }

    /** A trie node; {@code val}/{@code len} are {@code -1} unless a key ends here. */
    private static class Node {
        // 256 slots: a 255-slot table overflows on byte value 127.
        private final Node[] children = new Node[FANOUT];

        byte val = -1;
        int len = -1;
    }

    /**
     * Stateful cursor that walks a {@link Trie} one byte at a time.
     */
    public static class Matcher {

        final Node root;
        Node current;

        Node parent = null;

        /**
         * @param trie the trie to walk; the matcher starts at its root
         */
        public Matcher(Trie trie) {
            root = current = trie.root;
        }

        /**
         * Advances the cursor by one byte.
         *
         * @param b the next input byte; only the low 8 bits are used, so both
         *          signed ({@code -128..127}) and unsigned ({@code 0..255})
         *          representations are accepted
         * @return {@code null} if no registered key continues with this byte
         *         (the cursor is then dead until {@link #reset()} is called);
         *         {@code -1} if the bytes so far are a proper prefix of a key;
         *         otherwise the value of the key that ends at this byte
         */
        public Byte match(int b) {
            if (current == null) {
                // A previous miss was not followed by reset(); stay dead
                // instead of failing with a NullPointerException.
                return null;
            }
            Node next = current.children[b & 0xFF];
            if (next == null) {
                current = null;
                return null;
            }
            parent = current;
            current = next;
            return next.val;
        }

        /** Moves the cursor back to the root and forgets the previous step. */
        public void reset() {
            current = root;
            // Clear the stale parent so isFirstCharacter() cannot report a
            // traversal that happened before the reset.
            parent = null;
        }

        /**
         * @return {@code true} unless the cursor is positioned at the root;
         *         this includes the dead state that follows a miss
         */
        public boolean inTraversal() {
            return current != root;
        }

        /**
         * @return the byte length of the key ending at the current node, or
         *         {@code -1} if no key ends there or the cursor is dead
         */
        public int currentLength() {
            return current == null ? -1 : current.len;
        }

        /**
         * @return {@code true} if the most recent successful {@link #match(int)}
         *         since the last reset consumed the first byte of a key
         */
        public boolean isFirstCharacter() {
            return parent == root;
        }
    }
}

0 comments on commit dc6f68c

Please sign in to comment.