# default_mongo_config.yml
# Copyright (c) 2015, Georgia Tech Research Institute
# All rights reserved.
#
# This unpublished material is the property of the Georgia Tech
# Research Institute and is protected under copyright law.
# The methods and techniques described herein are considered
# trade secrets and/or confidential. Reproduction or distribution,
# in whole or in part, is forbidden except by the express written
# permission of the Georgia Tech Research Institute.
# ---
###############################################################################
#                                                                             #
#                        GENERAL CONFIGURATION DETAILS                        #
#                                                                             #
#   Configure DocIndexer to process Twitter JSON data contained in a         #
#   MongoDB database.                                                        #
#                                                                             #
###############################################################################
# [REQUIRED] Specify the full path to the LEAN repository's 'config' folder.
# This folder contains default stopword and spelling files, among other items.
# Items in this folder are loaded automatically at startup.
config_path: /path/to/lean/config
# [REQUIRED] Write the Lucene index to this folder. If this folder does not
# exist, it will be created.
outdir: /path/to/index
# [REQUIRED] Specify the analyzer to use. An analyzer consists of a tokenizer
# and a chain of zero or more token filters. The filters perform various
# transformations on the tokens as they pass down the chain. The first four
# analyzers are provided by Lucene and are not customizable; the 'custom'
# analyzers can be easily altered and recompiled.
#   org.apache.lucene.analysis.core.WhitespaceAnalyzer      Split text on whitespace only
#   org.apache.lucene.analysis.standard.StandardAnalyzer    Lucene's default text analyzer
#   org.apache.lucene.analysis.standard.ClassicAnalyzer     Lucene's StandardAnalyzer pre v3.1
#   org.apache.lucene.analysis.en.EnglishAnalyzer           Lucene's English-specific text analyzer
#   analyzers.FormalAnalyzer                                GTRI analyzer for formal documents
#   twitter.TwitterAnalyzer                                 GTRI custom analyzer for Twitter
analyzer: twitter.TwitterAnalyzer
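# For example, to index formal (non-Twitter) documents instead, one could
# select the GTRI formal-document analyzer from the list above:
#
#   analyzer: analyzers.FormalAnalyzer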
# [REQUIRED FOR MONGO INPUT] Access the MongoDB instance at this host address.
host: localhost
# [REQUIRED FOR MONGO INPUT] Access the MongoDB instance served on this port.
port: 27019
# [REQUIRED FOR MONGO INPUT] Connect to this database.
database: twitter
# [REQUIRED FOR MONGO INPUT] Ingest tweets from this collection.
collection: ows
# [REQUIRED FOR MONGO INPUT] Collect at most this many tweets; a value of -1
# applies no limit to the number of tweets collected.
limit: -1
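# A quick way to sanity-check the four Mongo settings above before indexing
# is to query the collection directly (a sketch, assuming the legacy 'mongo'
# shell is installed; host, port, database, and collection match the values
# configured above):
#
#   mongo --host localhost --port 27019 twitter --eval "db.ows.count()"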
# [OPTIONAL] Specify the absolute path to a 'user' stopword file. Use this
# file to give DocIndexer additional stopwords that should be removed from the
# token stream, but that are not contained in the default stopword list
# (found in the config folder). This file can also be used to remove specific
# tokens that may not be of interest for a given data set. The file name does
# not have to be 'user_stopwords.txt'. To NOT use a user stopword file, leave
# the following line commented out.
#user_stopword_file: /path/to/user_stopwords.txt
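# A hypothetical user stopword file, assuming the common one-token-per-line
# format (compare with the default stopword file in the config folder):
#
#   rt
#   via
#   tweet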
# [OPTIONAL] Specify the absolute path to a 'user' spelling file. Use this
# file to give DocIndexer additional spelling corrections that should be
# performed on the token stream, but that are not contained in the default list
# of spelling corrections (found in the config folder). This file can be used
# to remove or 'normalize' domain-specific slang. The file name does not have
# to be 'user_spelling.txt'. To NOT use a user spelling file, leave the next
# line commented out.
#user_spelling_file: /path/to/user_spelling.txt
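# A hypothetical user spelling file, assuming one whitespace-separated
# misspelling/correction pair per line (compare with the default spelling
# file in the config folder for the exact format):
#
#   u      you
#   pls    please
#   thx    thanks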
###############################################################################
#                                                                             #
#                           Boolean flags (Yes/No)                            #
#                                                                             #
###############################################################################
# [OPTIONAL] Whether to ignore retweets. Retweets have "RT" as the first
# token in the "text" field.
IGNORE_RETWEETS: No
# [OPTIONAL] Whether to ignore hashtags, i.e. tokens such as #justinbieber.
# NOTE: this will ignore only those hashtags identified with the '#' tag by
# the part-of-speech tagger. Hashtags used as proper nouns (^) or in other
# meaningful ways will be kept.
IGNORE_HASHTAGS: No
# [OPTIONAL] Whether to ignore URLs and e-mail addresses. Enabling this flag
# removes all tokens having a 'U' tag.
IGNORE_URLS: No
# [OPTIONAL] Whether to ignore at-mentions, i.e. tokens such as @username
# that begin with an '@' symbol. Enabling this flag removes all tokens having
# an '@' tag.
IGNORE_AT_MENTIONS: No
# [OPTIONAL] Whether to ignore numbers, dates, times, etc. Enabling this flag
# removes all tokens having a '$' tag.
IGNORE_NUMERALS: No
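# NOTE: the single-character tags referenced above ('#', '^', 'U', '@', '$')
# correspond to the CMU ARK TweetNLP part-of-speech tagset, which the Twitter
# analyzer's tagger appears to use (an inference from the tag names; not
# stated in this file).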
# [OPTIONAL] Whether to disable stemming in the CUSTOM_TWITTER analyzer.
DISABLE_STEMMING: No
# [OPTIONAL] Whether to disable all filters in the CUSTOM_TWITTER analyzer; if
# this option is selected, tokenization is the only operation performed on the
# data. Use this option if you want to see the full set of tokens that emerge
# from the tokenizer.
DISABLE_CUSTOM_FILTERS: No