Make names with special chars searchable.

This is a better fix than I34bfa864, which was only a quick workaround for
double-barrelled names. Now names with other special characters are
searchable too.

Also, previously, a query "doublebarrelled" wouldn't match
"double-barrelled", but now it will.

Bug 5592553
Change-Id: Id1d44261f577df7abf701311ed1c86fb093547da

commit: 116d86ddd67330428f9128613b4886fc0ea66221 [log] [tgz]
author: Makoto Onuki <omakoto@google.com> Fri Apr 27 13:59:45 2012 -0700
committer: Makoto Onuki <omakoto@google.com> Mon Apr 30 15:03:08 2012 -0700
tree: 35d01b44ca3c25ab31b61094c43c2fa9af285fdf
parent: f76a0fe0629fb626d96081dc0f272433e4920ba3 [diff]
diff --git a/src/com/android/providers/contacts/ContactsDatabaseHelper.java b/src/com/android/providers/contacts/ContactsDatabaseHelper.java
index 4804897..2e43177 100644
--- a/src/com/android/providers/contacts/ContactsDatabaseHelper.java
+++ b/src/com/android/providers/contacts/ContactsDatabaseHelper.java

@@ -108,7 +108,7 @@
      *   700-799 Jelly Bean
      * </pre>
      */
-    static final int DATABASE_VERSION = 702;
+    static final int DATABASE_VERSION = 703;
 
     private static final String DATABASE_NAME = "contacts2.db";
     private static final String DATABASE_PRESENCE = "presence_db";
@@ -2385,6 +2385,12 @@
             oldVersion = 702;
         }
 
+        if (oldVersion < 703) {
+            // Now names like "L'Image" will be searchable.
+            upgradeSearchIndex = true;
+            oldVersion = 703;
+        }
+
         if (upgradeViewsAndTriggers) {
             createContactsViews(db);
             createGroupsView(db);

diff --git a/src/com/android/providers/contacts/SearchIndexManager.java b/src/com/android/providers/contacts/SearchIndexManager.java
index 5ca9859..bd4e1cc 100644
--- a/src/com/android/providers/contacts/SearchIndexManager.java
+++ b/src/com/android/providers/contacts/SearchIndexManager.java

@@ -20,6 +20,8 @@
 import com.android.providers.contacts.ContactsDatabaseHelper.RawContactsColumns;
 import com.android.providers.contacts.ContactsDatabaseHelper.SearchIndexColumns;
 import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
+import com.google.android.collect.Lists;
+import com.google.common.annotations.VisibleForTesting;
 
 import android.content.ContentValues;
 import android.database.Cursor;
@@ -35,7 +37,9 @@
 import android.text.TextUtils;
 import android.util.Log;
 
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -138,7 +142,7 @@
             appendContent(value, SEPARATOR_SPACE);
         }
 
-        public void appendContent(String value, int format) {
+        private void appendContent(String value, int format) {
             if (TextUtils.isEmpty(value)) {
                 return;
             }
@@ -182,18 +186,33 @@
             mSbTokens.append(token);
         }
 
-        private static final Pattern PATTERN_HYPHEN = Pattern.compile("\\-");
-
         public void appendName(String name) {
             if (TextUtils.isEmpty(name)) {
                 return;
             }
-            if (name.indexOf('-') < 0) {
-                // Common case -- no hyphens in it.
-                appendNameInternal(name);
-            } else {
-                // In order to make hyphenated names searchable, let's split names with '-'.
-                for (String namePart : PATTERN_HYPHEN.split(name)) {
+            // First, put the original name.
+            appendNameInternal(name);
+
+            // Then, if the name contains more than one FTS token, put each token into the index
+            // too.
+            //
+            // This is to make names with special characters searchable, such as "double-barrelled"
+            // and "L'Image".
+            //
+            // Here's how it works:
+            // Because we "normalize" names when putting into the index, if we only put
+            // "double-barrelled", the index will only contain "doublebarrelled".
+            // Now, if the user searches for "double-barrelled", the searcher tokenizes it into
+            // two tokens, "double" and "barrelled".  The first one matches "doublebarrelled"
+            // but the second one doesn't (because we only do the prefix match), so
+            // "doublebarrelled" doesn't match.
+            // So, here, we put each token in a name into the index too.  In the case above,
+            // we put also "double" and "barrelled".
+            // With this, queries such as "double-barrelled", "double barrelled", "doublebarrelled"
+            // will all match "double-barrelled".
+            final List<String> nameParts = splitIntoFtsTokens(name);
+            if (nameParts.size() > 1) {
+                for (String namePart : nameParts) {
                     if (!TextUtils.isEmpty(namePart)) {
                         appendNameInternal(namePart);
                     }
@@ -201,6 +220,9 @@
             }
         }
 
+        /**
+         * Normalize a name and add to {@link #mSbName}
+         */
         private void appendNameInternal(String name) {
             if (mSbName.length() != 0) {
                 mSbName.append(' ');
@@ -373,6 +395,29 @@
     }
 
     /**
+     * Token separator that matches SQLite's "simple" tokenizer.
+     * - Unicode codepoints >= 128: Everything
+     * - Unicode codepoints < 128: Alphanumeric and "_"
+     * - Everything else is a separator of tokens
+     */
+    private static final Pattern FTS_TOKEN_SEPARATOR_RE =
+            Pattern.compile("[^\u0080-\uffff\\p{Alnum}_]");
+
+    /**
+     * Tokenize a string the same way as SQLite's "simple" tokenizer.
+     */
+    @VisibleForTesting
+    static List<String> splitIntoFtsTokens(String s) {
+        final ArrayList<String> ret = Lists.newArrayList();
+        for (String token : FTS_TOKEN_SEPARATOR_RE.split(s)) {
+            if (!TextUtils.isEmpty(token)) {
+                ret.add(token);
+            }
+        }
+        return ret;
+    }
+
+    /**
      * Tokenizes the query and normalizes/hex encodes each token. The tokenizer uses the same
      * rules as SQLite's "simple" tokenizer. Each token is added to the retokenizer and then
      * returned as a String.
@@ -380,35 +425,9 @@
      * @see FtsQueryBuilder#SCOPED_NAME_NORMALIZING
      */
     public static String getFtsMatchQuery(String query, FtsQueryBuilder ftsQueryBuilder) {
-        // SQLite's "simple" tokenizer uses the following rules to detect characters:
-        //  - Unicode codepoints >= 128: Everything
-        //  - Unicode codepoints < 128: Alphanumeric and "_"
-        // Everything else is a separator of tokens
-        int tokenStart = -1;
         final StringBuilder result = new StringBuilder();
-        for (int i = 0; i <= query.length(); i++) {
-            final boolean isChar;
-            if (i == query.length()) {
-                isChar = false;
-            } else {
-                final char ch = query.charAt(i);
-                if (ch >= 128) {
-                    isChar = true;
-                } else {
-                    isChar = Character.isLetterOrDigit(ch) || ch == '_';
-                }
-            }
-            if (isChar) {
-                if (tokenStart == -1) {
-                    tokenStart = i;
-                }
-            } else {
-                if (tokenStart != -1) {
-                    final String token = query.substring(tokenStart, i);
-                    ftsQueryBuilder.addToken(result, token);
-                    tokenStart = -1;
-                }
-            }
+        for (String token : splitIntoFtsTokens(query)) {
+            ftsQueryBuilder.addToken(result, token);
         }
         return result.toString();
     }

diff --git a/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java b/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java
index ed1c23a..3abdc3f 100644
--- a/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java
+++ b/tests/src/com/android/providers/contacts/SearchIndexManagerTest.java

@@ -25,6 +25,7 @@
 import android.provider.ContactsContract.CommonDataKinds.StructuredPostal;
 import android.provider.ContactsContract.Contacts;
 import android.provider.ContactsContract.SearchSnippetColumns;
+import android.test.MoreAsserts;
 import android.test.suitebuilder.annotation.MediumTest;
 import android.test.suitebuilder.annotation.Suppress;
 
@@ -377,8 +378,10 @@
         assertStoredValue(buildSearchUri("Last-n"), SearchSnippetColumns.SNIPPET, null);
         assertStoredValue(buildSearchUri("Last-name"), SearchSnippetColumns.SNIPPET, null);
 
-        // With the current implementation this even works, but this may stop working when we
-        // fix the "O'Neill" case below.
+        // This will work too.
+        assertStoredValue(buildSearchUri("Lastname"), SearchSnippetColumns.SNIPPET, null);
+
+        // This doesn't have to work, but it does with the current implementation.
         assertStoredValue(buildSearchUri("name"), SearchSnippetColumns.SNIPPET, null);
     }
 
@@ -391,13 +394,11 @@
         assertStoredValue(buildSearchUri("Last-"), SearchSnippetColumns.SNIPPET, null);
         assertStoredValue(buildSearchUri("Last-n"), SearchSnippetColumns.SNIPPET, null);
         assertStoredValue(buildSearchUri("Last-name"), SearchSnippetColumns.SNIPPET, null);
+
+        // This will work too.
+        assertStoredValue(buildSearchUri("Lastname"), SearchSnippetColumns.SNIPPET, null);
     }
 
-    /**
-     * Probably both "oneill" and "o'neill" should match "o'neill", but at this point only "oneill"
-     * works.
-     */
-    @Suppress
     public void testNameWithPunctuations() {
         createRawContactWithName("First", "O'Neill");
 
@@ -467,6 +468,23 @@
                 SearchSnippetColumns.SNIPPET, "[john@doe.com]");
     }
 
+    public void testSplitIntoFtsTokens() {
+        checkSplitIntoFtsTokens("a", "a");
+        checkSplitIntoFtsTokens("a_b c%d-e'f", "a_b", "c", "d", "e", "f");
+        checkSplitIntoFtsTokens("  ", new String[0]);
+        // These are all "control" characters, but treated as "letters".
+        // (See http://en.wikipedia.org/wiki/C1_Controls_and_Latin-1_Supplement for what they are)
+        checkSplitIntoFtsTokens("\u0080 \u0081 \u0082", "\u0080", "\u0081", "\u0082");
+
+        // FFF0 is also a token.
+        checkSplitIntoFtsTokens(" \ufff0  ", "\ufff0");
+    }
+
+    private void checkSplitIntoFtsTokens(String input, String... expectedTokens) {
+        MoreAsserts.assertEquals(expectedTokens,
+                SearchIndexManager.splitIntoFtsTokens(input).toArray(new String[0]));
+    }
+
     private Uri buildSearchUri(String filter) {
         return buildSearchUri(filter, false);
     }
commit	116d86ddd67330428f9128613b4886fc0ea66221	[log] [tgz]
author	Makoto Onuki <omakoto@google.com>	Fri Apr 27 13:59:45 2012 -0700
committer	Makoto Onuki <omakoto@google.com>	Mon Apr 30 15:03:08 2012 -0700
tree	35d01b44ca3c25ab31b61094c43c2fa9af285fdf
parent	f76a0fe0629fb626d96081dc0f272433e4920ba3 [diff]