Make names with special chars searchable.
This is a better fix than I34bfa864, which was only a quick workaround for
double-barrelled names.
Now names with other special characters are searchable too.
Also, previously, a query "doublebarrelled" wouldn't match "double-barrelled",
but now it will.
Bug 5592553
Change-Id: Id1d44261f577df7abf701311ed1c86fb093547da
diff --git a/src/com/android/providers/contacts/ContactsDatabaseHelper.java b/src/com/android/providers/contacts/ContactsDatabaseHelper.java
index 4804897..2e43177 100644
--- a/src/com/android/providers/contacts/ContactsDatabaseHelper.java
+++ b/src/com/android/providers/contacts/ContactsDatabaseHelper.java
@@ -108,7 +108,7 @@
* 700-799 Jelly Bean
* </pre>
*/
- static final int DATABASE_VERSION = 702;
+ static final int DATABASE_VERSION = 703;
private static final String DATABASE_NAME = "contacts2.db";
private static final String DATABASE_PRESENCE = "presence_db";
@@ -2385,6 +2385,12 @@
oldVersion = 702;
}
+ if (oldVersion < 703) {
+ // Now names like "L'Image" will be searchable.
+ upgradeSearchIndex = true;
+ oldVersion = 703;
+ }
+
if (upgradeViewsAndTriggers) {
createContactsViews(db);
createGroupsView(db);
diff --git a/src/com/android/providers/contacts/SearchIndexManager.java b/src/com/android/providers/contacts/SearchIndexManager.java
index 5ca9859..bd4e1cc 100644
--- a/src/com/android/providers/contacts/SearchIndexManager.java
+++ b/src/com/android/providers/contacts/SearchIndexManager.java
@@ -20,6 +20,8 @@
import com.android.providers.contacts.ContactsDatabaseHelper.RawContactsColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.SearchIndexColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
+import com.google.android.collect.Lists;
+import com.google.common.annotations.VisibleForTesting;
import android.content.ContentValues;
import android.database.Cursor;
@@ -35,7 +37,9 @@
import android.text.TextUtils;
import android.util.Log;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
@@ -138,7 +142,7 @@
appendContent(value, SEPARATOR_SPACE);
}
- public void appendContent(String value, int format) {
+ private void appendContent(String value, int format) {
if (TextUtils.isEmpty(value)) {
return;
}
@@ -182,18 +186,33 @@
mSbTokens.append(token);
}
- private static final Pattern PATTERN_HYPHEN = Pattern.compile("\\-");
-
public void appendName(String name) {
if (TextUtils.isEmpty(name)) {
return;
}
- if (name.indexOf('-') < 0) {
- // Common case -- no hyphens in it.
- appendNameInternal(name);
- } else {
- // In order to make hyphenated names searchable, let's split names with '-'.
- for (String namePart : PATTERN_HYPHEN.split(name)) {
+ // First, put the original name.
+ appendNameInternal(name);
+
+ // Then, if the name contains more than one FTS token, put each token into the index
+ // too.
+ //
+ // This is to make names with special characters searchable, such as "double-barrelled"
+ // and "L'Image".
+ //
+ // Here's how it works:
+ // Because we "normalize" names when putting into the index, if we only put
+ // "double-barrelled", the index will only contain "doublebarrelled".
+ // Now, if the user searches for "double-barrelled", the searcher tokenizes it into
+ // two tokens, "double" and "barrelled". The first one matches "doublebarrelled"
+ // but the second one doesn't (because we only do the prefix match), so
+ // "doublebarrelled" doesn't match.
+ // So, here, we put each token in a name into the index too. In the case above,
+ // we put also "double" and "barrelled".
+ // With this, queries such as "double-barrelled", "double barrelled", "doublebarrelled"
+ // will all match "double-barrelled".
+ final List<String> nameParts = splitIntoFtsTokens(name);
+ if (nameParts.size() > 1) {
+ for (String namePart : nameParts) {
if (!TextUtils.isEmpty(namePart)) {
appendNameInternal(namePart);
}
@@ -201,6 +220,9 @@
}
}
+ /**
+ * Normalize a name and add to {@link #mSbName}
+ */
private void appendNameInternal(String name) {
if (mSbName.length() != 0) {
mSbName.append(' ');
@@ -373,6 +395,29 @@
}
/**
+ * Token separator that matches SQLite's "simple" tokenizer.
+ * - Unicode codepoints >= 128: Everything
+ * - Unicode codepoints < 128: Alphanumeric and "_"
+ * - Everything else is a separator of tokens
+ */
+ private static final Pattern FTS_TOKEN_SEPARATOR_RE =
+ Pattern.compile("[^\u0080-\uffff\\p{Alnum}_]");
+
+ /**
+ * Tokenize a string the same way SQLite's "simple" tokenizer does.
+ */
+ @VisibleForTesting
+ static List<String> splitIntoFtsTokens(String s) {
+ final ArrayList<String> ret = Lists.newArrayList();
+ for (String token : FTS_TOKEN_SEPARATOR_RE.split(s)) {
+ if (!TextUtils.isEmpty(token)) {
+ ret.add(token);
+ }
+ }
+ return ret;
+ }
+
+ /**
* Tokenizes the query and normalizes/hex encodes each token. The tokenizer uses the same
* rules as SQLite's "simple" tokenizer. Each token is added to the retokenizer and then
* returned as a String.
@@ -380,35 +425,9 @@
* @see FtsQueryBuilder#SCOPED_NAME_NORMALIZING
*/
public static String getFtsMatchQuery(String query, FtsQueryBuilder ftsQueryBuilder) {
- // SQLite's "simple" tokenizer uses the following rules to detect characters:
- // - Unicode codepoints >= 128: Everything
- // - Unicode codepoints < 128: Alphanumeric and "_"
- // Everything else is a separator of tokens
- int tokenStart = -1;
final StringBuilder result = new StringBuilder();
- for (int i = 0; i <= query.length(); i++) {
- final boolean isChar;
- if (i == query.length()) {
- isChar = false;
- } else {
- final char ch = query.charAt(i);
- if (ch >= 128) {
- isChar = true;
- } else {
- isChar = Character.isLetterOrDigit(ch) || ch == '_';
- }
- }
- if (isChar) {
- if (tokenStart == -1) {
- tokenStart = i;
- }
- } else {
- if (tokenStart != -1) {
- final String token = query.substring(tokenStart, i);
- ftsQueryBuilder.addToken(result, token);
- tokenStart = -1;
- }
- }
+ for (String token : splitIntoFtsTokens(query)) {
+ ftsQueryBuilder.addToken(result, token);
}
return result.toString();
}
diff --git a/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java b/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java
index ed1c23a..3abdc3f 100644
--- a/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java
+++ b/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java
@@ -25,6 +25,7 @@
import android.provider.ContactsContract.CommonDataKinds.StructuredPostal;
import android.provider.ContactsContract.Contacts;
import android.provider.ContactsContract.SearchSnippetColumns;
+import android.test.MoreAsserts;
import android.test.suitebuilder.annotation.MediumTest;
import android.test.suitebuilder.annotation.Suppress;
@@ -377,8 +378,10 @@
assertStoredValue(buildSearchUri("Last-n"), SearchSnippetColumns.SNIPPET, null);
assertStoredValue(buildSearchUri("Last-name"), SearchSnippetColumns.SNIPPET, null);
- // With the current implementation this even works, but this may stop working when we
- // fix the "O'Neill" case below.
+ // This will work too.
+ assertStoredValue(buildSearchUri("Lastname"), SearchSnippetColumns.SNIPPET, null);
+
+ // This doesn't have to work, but it does with the current implementation.
assertStoredValue(buildSearchUri("name"), SearchSnippetColumns.SNIPPET, null);
}
@@ -391,13 +394,11 @@
assertStoredValue(buildSearchUri("Last-"), SearchSnippetColumns.SNIPPET, null);
assertStoredValue(buildSearchUri("Last-n"), SearchSnippetColumns.SNIPPET, null);
assertStoredValue(buildSearchUri("Last-name"), SearchSnippetColumns.SNIPPET, null);
+
+ // This will work too.
+ assertStoredValue(buildSearchUri("Lastname"), SearchSnippetColumns.SNIPPET, null);
}
- /**
- * Probably both "oneill" and "o'neill" should match "o'neill", but at this point only "oneill"
- * works.
- */
- @Suppress
public void testNameWithPunctuations() {
createRawContactWithName("First", "O'Neill");
@@ -467,6 +468,23 @@
SearchSnippetColumns.SNIPPET, "[john@doe.com]");
}
+ public void testSplitIntoFtsTokens() {
+ checkSplitIntoFtsTokens("a", "a");
+ checkSplitIntoFtsTokens("a_b c%d-e'f", "a_b", "c", "d", "e", "f");
+ checkSplitIntoFtsTokens(" ", new String[0]);
+ // These are all "control" characters, but they are treated as "letters".
+ // (See http://en.wikipedia.org/wiki/C1_Controls_and_Latin-1_Supplement for what they are)
+ checkSplitIntoFtsTokens("\u0080 \u0081 \u0082", "\u0080", "\u0081", "\u0082");
+
+ // FFF0 is also a token.
+ checkSplitIntoFtsTokens(" \ufff0 ", "\ufff0");
+ }
+
+ private void checkSplitIntoFtsTokens(String input, String... expectedTokens) {
+ MoreAsserts.assertEquals(expectedTokens,
+ SearchIndexManager.splitIntoFtsTokens(input).toArray(new String[0]));
+ }
+
private Uri buildSearchUri(String filter) {
return buildSearchUri(filter, false);
}