Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spellcheck all enabled scripts and improve URL detection #613

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import android.text.SpannableStringBuilder;
import android.text.TextUtils;
import android.text.style.CharacterStyle;
import helium314.keyboard.latin.settings.Settings;
import helium314.keyboard.latin.utils.Log;
import android.view.KeyEvent;
import android.view.inputmethod.CompletionInfo;
Expand Down Expand Up @@ -58,6 +59,7 @@ public final class RichInputConnection implements PrivateCommandPerformer {
private static final boolean DEBUG_BATCH_NESTING = false;
private static final int NUM_CHARS_TO_GET_BEFORE_CURSOR = 40;
private static final int NUM_CHARS_TO_GET_AFTER_CURSOR = 40;
private static final int NUM_CHARS_TO_GET_ADVANCED_URL = 128;
private static final int INVALID_CURSOR_POSITION = -1;

/**
Expand Down Expand Up @@ -744,18 +746,21 @@ private static boolean isPartOfCompositionForScript(final int codePoint,
public TextRange getWordRangeAtCursor(final SpacingAndPunctuations spacingAndPunctuations,
final String script, final boolean justDeleted) {
mIC = mParent.getCurrentInputConnection();
final boolean urlDetection = Settings.getInstance().getCurrent().mUrlDetectionEnabled;
final int numCharsToGetBeforeCursor = (urlDetection) ? NUM_CHARS_TO_GET_ADVANCED_URL : NUM_CHARS_TO_GET_BEFORE_CURSOR;
final int numCharsToGetAfterCursor = (urlDetection) ? NUM_CHARS_TO_GET_ADVANCED_URL : NUM_CHARS_TO_GET_AFTER_CURSOR;
if (!isConnected()) {
return null;
}
CharSequence before = getTextBeforeCursorAndDetectLaggyConnection(
OPERATION_GET_WORD_RANGE_AT_CURSOR,
SLOW_INPUT_CONNECTION_ON_PARTIAL_RELOAD_MS,
NUM_CHARS_TO_GET_BEFORE_CURSOR,
numCharsToGetBeforeCursor,
InputConnection.GET_TEXT_WITH_STYLES);
final CharSequence after = getTextAfterCursorAndDetectLaggyConnection(
OPERATION_GET_WORD_RANGE_AT_CURSOR,
SLOW_INPUT_CONNECTION_ON_PARTIAL_RELOAD_MS,
NUM_CHARS_TO_GET_AFTER_CURSOR,
numCharsToGetAfterCursor,
InputConnection.GET_TEXT_WITH_STYLES);
if (before == null || after == null) {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ private TextUtils() {
// large enough for all reasonable purposes, but avoid purposeful attacks. 100k sounds about
// right for this.
public static final int MAX_CHARACTERS_FOR_RECAPITALIZATION = 1024 * 100;
// How many characters can a composed word have before it is assumed to be non-suggestible.
// The length of the lengthy word "internationalization" is 20.
public static final int MAX_CHARACTERS_FOR_SUGGESTIONS = 20;

// Key events coming any faster than this are long-presses.
public static final int LONG_PRESS_MILLISECONDS = 200;
Expand Down
10 changes: 10 additions & 0 deletions app/src/main/java/helium314/keyboard/latin/common/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Matcher;

public final class StringUtils {

Expand Down Expand Up @@ -418,6 +419,15 @@ public static String capitalizeEachWord(@NonNull final String text,
return builder.toString();
}

// Use regex pattern matching to test if a CharSequence contains a URL.
// Returns the last index of the URL, or -1 if not found.
public static int findURLEndIndex(@NonNull final CharSequence sequence) {
Matcher matcher = android.util.Patterns.WEB_URL.matcher(sequence);
if (matcher.find())
return matcher.end();
return -1;
}

/**
* Approximates whether the text before the cursor looks like a URL.
* <p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1580,7 +1580,8 @@ public void performUpdateSuggestionStripSync(final SettingsValues settingsValues
return;
}

if (!mWordComposer.isComposingWord() && !settingsValues.mBigramPredictionEnabled) {
if (!mWordComposer.isComposingWord() && !settingsValues.mBigramPredictionEnabled
|| mWordComposer.size() > Constants.MAX_CHARACTERS_FOR_SUGGESTIONS) {
mSuggestionStripViewAccessor.setNeutralSuggestionStrip();
return;
}
Expand Down Expand Up @@ -2336,6 +2337,11 @@ public boolean retryResetCachesAndReturnSuccess(final boolean tryResumeSuggestio
public void getSuggestedWords(final SettingsValues settingsValues,
final Keyboard keyboard, final int keyboardShiftMode, final int inputStyle,
final int sequenceNumber, final OnGetSuggestedWordsCallback callback) {
// don't get suggestions if composed word is very long
if (mWordComposer.size() > Constants.MAX_CHARACTERS_FOR_SUGGESTIONS) {
callback.onGetSuggestedWords(SuggestedWords.getEmptyInstance());
return;
}
mWordComposer.adviseCapitalizedModeBeforeFetchingSuggestions(
getActualCapsMode(settingsValues, keyboardShiftMode));
mSuggest.getSuggestedWords(mWordComposer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import android.provider.UserDictionary.Words;
import android.service.textservice.SpellCheckerService.Session;
import android.text.TextUtils;
import helium314.keyboard.latin.settings.Settings;
import helium314.keyboard.latin.utils.Log;
import android.util.LruCache;
import android.view.inputmethod.InputMethodManager;
Expand Down Expand Up @@ -59,6 +60,7 @@ public abstract class AndroidWordLevelSpellCheckerSession extends Session {
"(\\u0022|\\u0027|\\u0060|\\u00B4|\\u2018|\\u2018|\\u201C|\\u201D)";

private static final Map<String, String> scriptToPunctuationRegexMap = new TreeMap<>();
private List <Locale> localesToCheck;

static {
// TODO: add other non-English language specific punctuation later.
Expand Down Expand Up @@ -118,8 +120,8 @@ public void onChange(boolean self) {

private void updateLocale() {
final String localeString = getLocale();

if (mLocale == null || !mLocale.toString().equals(localeString)) {
if (mLocale == null) return;
if (!mLocale.toString().equals(localeString)) {
final String oldLocale = mLocale == null ? "null" : mLocale.toString();
Log.d(TAG, "Updating locale from " + oldLocale + " to " + localeString);

Expand All @@ -128,10 +130,25 @@ private void updateLocale() {
if (mLocale == null) mScript = ScriptUtils.SCRIPT_UNKNOWN;
else mScript = ScriptUtils.script(mLocale);
}
if (localesToCheck.isEmpty())
localesToCheck.add(mLocale);
else if (!localesToCheck.get(0).equals(mLocale)) {
// Set mLocale to be at the beginning so it is checked first
localesToCheck.add(0, mLocale);
// Make sure no other locales with the same script are checked
for (int i = localesToCheck.size() - 1; i >= 1; i--) {
Locale locale = localesToCheck.get(i);
if (ScriptUtils.script(locale).equals(mScript)) {
localesToCheck.remove(i);
}
}
}
}

@Override
public void onCreate() {
final SharedPreferences prefs = DeviceProtectedUtils.getSharedPreferences(mService);
localesToCheck = SubtypeSettingsKt.getUniqueScriptLocalesFromEnabledSubtypes(prefs);
updateLocale();
}

Expand Down Expand Up @@ -171,58 +188,36 @@ public void onClose() {
}

private static final int CHECKABILITY_CHECKABLE = 0;
private static final int CHECKABILITY_TOO_MANY_NON_LETTERS = 1;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to be notified if I wrote "My phone number is0123456789" <- missed a space after "is"

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem I see here is that this might not be wanted by some people (it might have been introduced for a reason), and with the current style of PR I can't just simply revert this change if it turns out clearly unpopular.

private static final int CHECKABILITY_CONTAINS_PERIOD = 2;
private static final int CHECKABILITY_EMAIL_OR_URL = 3;
private static final int CHECKABILITY_FIRST_LETTER_UNCHECKABLE = 4;
private static final int CHECKABILITY_TOO_SHORT = 5;

private static final int CHECKABILITY_CONTAINS_PERIOD = 1;
private static final int CHECKABILITY_EMAIL_OR_URL = 2;

private static final int CHECKABILITY_TOO_SHORT = 3;
/**
* Finds out whether a particular string should be filtered out of spell checking.
* <p>
* This will loosely match URLs, numbers, symbols. To avoid always underlining words that
* we know we will never recognize, this accepts a script identifier that should be one
* of the SCRIPT_* constants defined above, to rule out quickly characters from very
* different languages.
*
* This will match URLs if URL detection is enabled, as well as text that has a period
* or is too short.
* @param text the string to evaluate.
* @param script the identifier for the script this spell checker recognizes
* @return one of the FILTER_OUT_* constants above.
*/
private static int getCheckabilityInScript(final String text, final String script) {
private static int getCheckability(final String text) {
if (TextUtils.isEmpty(text) || text.length() <= 1) return CHECKABILITY_TOO_SHORT;

// TODO: check if an equivalent processing can't be done more quickly with a
// compiled regexp.
// Filter by first letter
final int firstCodePoint = text.codePointAt(0);
// Filter out words that don't start with a letter or an apostrophe
if (!ScriptUtils.isLetterPartOfScript(firstCodePoint, script)
&& '\'' != firstCodePoint) return CHECKABILITY_FIRST_LETTER_UNCHECKABLE;

// Filter contents
final int length = text.length();
int letterCount = 0;
for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) {
final int codePoint = text.codePointAt(i);
// Any word containing a COMMERCIAL_AT is probably an e-mail address
// Any word containing a SLASH is probably either an ad-hoc combination of two
// words or a URI - in either case we don't want to spell check that
if (Constants.CODE_COMMERCIAL_AT == codePoint || Constants.CODE_SLASH == codePoint) {
return CHECKABILITY_EMAIL_OR_URL;
}
// If the string contains a period, native returns strange suggestions (it seems
// to return suggestions for everything up to the period only and to ignore the
// rest), so we suppress lookup if there is a period.
// TODO: investigate why native returns these suggestions and remove this code.
if (Constants.CODE_PERIOD == codePoint) {
return CHECKABILITY_CONTAINS_PERIOD;
}
if (ScriptUtils.isLetterPartOfScript(codePoint, script)) ++letterCount;
// Filter out e-mail address and URL
if (Settings.getInstance().getCurrent().mUrlDetectionEnabled && StringUtils.findURLEndIndex(text) != -1) {
return CHECKABILITY_EMAIL_OR_URL;
}
// If the string contains a period, native returns strange suggestions (it seems
// to return suggestions for everything up to the period only and to ignore the
// rest), so we suppress lookup if there is a period.
// TODO: investigate why native returns these suggestions and remove this code.
if (text.indexOf(Constants.CODE_PERIOD) != -1) {
return CHECKABILITY_CONTAINS_PERIOD;
}
// Guestimate heuristic: perform spell checking if at least 3/4 of the characters
// in this word are letters
return (letterCount * 4 < length * 3)
? CHECKABILITY_TOO_MANY_NON_LETTERS : CHECKABILITY_CHECKABLE;
return CHECKABILITY_CHECKABLE;
}

/**
Expand Down Expand Up @@ -275,22 +270,40 @@ protected SuggestionsInfo onGetSuggestionsInternal(
.replaceAll("^" + quotesRegexp, "")
.replaceAll(quotesRegexp + "$", "");

// Find out which locale should be used
boolean foundLocale = false;
for (int i = 0, length = text.length(); !foundLocale && i < length; i++) {
final int codePoint = text.codePointAt(i);
for (Locale locale : localesToCheck) {
String localeScript = ScriptUtils.script(locale);
if (ScriptUtils.isLetterPartOfScript(codePoint, localeScript)) {
if (!mLocale.equals(locale)) {
Log.d(TAG, "Updating locale from " + mLocale.toString() + " to " + locale.toString());
mLocale = locale;
mScript = localeScript;
}
foundLocale = true;
break;
}
}
}
// If no locales were found, then the text probably contains numbers
// or special characters only, so it should not be spell checked.
if (!foundLocale || !mService.hasMainDictionaryForLocale(mLocale)) {
return AndroidSpellCheckerService.getNotInDictEmptySuggestions(false);
}
final String localeRegex = scriptToPunctuationRegexMap.get(ScriptUtils.script(mLocale));

if (localeRegex != null) {
text = text.replaceAll(localeRegex, "");
}

if (!mService.hasMainDictionaryForLocale(mLocale)) {
return AndroidSpellCheckerService.getNotInDictEmptySuggestions(false /* reportAsTypo */);
}

// Handle special patterns like email, URI, telephone number.
final int checkability = getCheckabilityInScript(text, mScript);
// Handle special patterns like email, URI, text with at least one period.
final int checkability = getCheckability(text);
if (CHECKABILITY_CHECKABLE != checkability) {
// CHECKABILITY_CONTAINS_PERIOD Typo should not be reported when text is a valid word followed by a single period (end of sentence).
boolean periodOnlyAtLastIndex = text.indexOf(Constants.CODE_PERIOD) == (text.length() - 1);
if (CHECKABILITY_CONTAINS_PERIOD == checkability) {
boolean periodOnlyAtLastIndex = text.indexOf(Constants.CODE_PERIOD) == (text.length() - 1);
final String[] splitText = text.split(Constants.REGEXP_PERIOD);
boolean allWordsAreValid = true;
// Validate all words on both sides of periods, skip empty tokens due to periods at first/last index
Expand All @@ -306,10 +319,10 @@ protected SuggestionsInfo onGetSuggestionsInternal(
new String[] {
TextUtils.join(Constants.STRING_SPACE, splitText) });
}
return (allWordsAreValid) ? AndroidSpellCheckerService.getInDictEmptySuggestions() :
AndroidSpellCheckerService.getNotInDictEmptySuggestions(!periodOnlyAtLastIndex);
}
return mService.isValidWord(mLocale, text) ?
AndroidSpellCheckerService.getInDictEmptySuggestions() :
AndroidSpellCheckerService.getNotInDictEmptySuggestions(!periodOnlyAtLastIndex);
return AndroidSpellCheckerService.getNotInDictEmptySuggestions(false); // uncheckable so can't be proven to be typo
}

// Handle normal words.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import android.view.textservice.TextInfo;

import helium314.keyboard.latin.common.Constants;
import helium314.keyboard.latin.common.StringUtils;
import helium314.keyboard.latin.settings.Settings;
import helium314.keyboard.latin.settings.SpacingAndPunctuations;
import helium314.keyboard.latin.utils.RunInLocaleKt;

Expand Down Expand Up @@ -71,6 +73,11 @@ public WordIterator(final Resources res, final Locale locale) {
public int getEndOfWord(final CharSequence sequence, final int fromIndex) {
final int length = sequence.length();
int index = fromIndex < 0 ? 0 : Character.offsetByCodePoints(sequence, fromIndex, 1);
if (Settings.getInstance().getCurrent().mUrlDetectionEnabled) {
final int urlEndIndex = StringUtils.findURLEndIndex(sequence.subSequence(index, length));
if (urlEndIndex != - 1)
return urlEndIndex + Character.charCount(Character.codePointAt(sequence, urlEndIndex));
}
while (index < length) {
final int codePoint = Character.codePointAt(sequence, index);
if (mSpacingAndPunctuations.isWordSeparator(codePoint)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@ fun getEnabledSubtypes(prefs: SharedPreferences, fallback: Boolean = false): Lis
return enabledSubtypes
}

fun getUniqueScriptLocalesFromEnabledSubtypes(prefs: SharedPreferences): List<Locale> {
val subtypes = getEnabledSubtypes(prefs)
val locales = mutableListOf<Locale>()
val scripts = HashSet<String>()
for (subtype in subtypes) {
val locale = subtype.locale()
val script: String = locale.script()
if (scripts.add(script)) {
locales.add(locale)
}
}
return locales
}

fun getAllAvailableSubtypes(): List<InputMethodSubtype> {
require(initialized)
return resourceSubtypesByLocale.values.flatten() + additionalSubtypes
Expand Down