Use bfrange to shrink ToUnicode table.

Patch from Arthur Hsu.  Original CL: http://codereview.appspot.com/4844043/
BUG=258

Review URL: http://codereview.appspot.com/4808083

git-svn-id: http://skia.googlecode.com/svn/trunk@2075 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gyp/tests.gyp b/gyp/tests.gyp
index 9bde66f..25fffd1 100644
--- a/gyp/tests.gyp
+++ b/gyp/tests.gyp
@@ -53,6 +53,7 @@
         '../tests/StringTest.cpp',
         '../tests/Test.cpp',
         '../tests/TestSize.cpp',
+	'../tests/ToUnicode.cpp',
         '../tests/UtilsTest.cpp',
         '../tests/Writer32Test.cpp',
         '../tests/XfermodeTest.cpp',
diff --git a/src/pdf/SkPDFFont.cpp b/src/pdf/SkPDFFont.cpp
index 1333c9f..ecee502 100644
--- a/src/pdf/SkPDFFont.cpp
+++ b/src/pdf/SkPDFFont.cpp
@@ -361,21 +361,6 @@
     cmap->writeText(kTypeInfo);
 }
 
-static void append_cmap_bfchar_table(uint16_t* glyph_id, SkUnichar* unicode,
-                                     size_t count,
-                                     SkDynamicMemoryWStream* cmap) {
-    cmap->writeDecAsText(count);
-    cmap->writeText(" beginbfchar\n");
-    for (size_t i = 0; i < count; ++i) {
-        cmap->writeText("<");
-        cmap->writeHexAsText(glyph_id[i], 4);
-        cmap->writeText("> <");
-        cmap->writeHexAsText(unicode[i], 4);
-        cmap->writeText(">\n");
-    }
-    cmap->writeText("endbfchar\n");
-}
-
 static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
     const char* kFooter =
         "endcmap\n"
@@ -385,38 +370,150 @@
     cmap->writeText(kFooter);
 }
 
-// Generate <bfchar> table according to PDF spec 1.4 and Adobe Technote 5014.
-static void append_cmap_bfchar_sections(
-                const SkTDArray<SkUnichar>& glyphUnicode,
-                const SkPDFGlyphSet* subset, SkDynamicMemoryWStream* cmap) {
-    // PDF spec defines that every bf* list can have at most 100 entries.
-    const size_t kMaxEntries = 100;
-    uint16_t glyphId[kMaxEntries];
-    SkUnichar unicode[kMaxEntries];
-    size_t index = 0;
-    for (int i = 0; i < glyphUnicode.count(); i++) {
-        if (glyphUnicode[i] && (subset == NULL || subset->has(i))) {
-            glyphId[index] = i;
-            unicode[index] = glyphUnicode[i];
-            ++index;
-        }
-        if (index == kMaxEntries) {
-            append_cmap_bfchar_table(glyphId, unicode, index, cmap);
-            index = 0;
-        }
-    }
+struct BFChar {
+    uint16_t fGlyphId;
+    SkUnichar fUnicode;
+};
 
-    if (index) {
-        append_cmap_bfchar_table(glyphId, unicode, index, cmap);
+struct BFRange {
+    uint16_t fStart;
+    uint16_t fEnd;
+    SkUnichar fUnicode;
+};
+
+static void append_bfchar_section(const SkTDArray<BFChar>& bfchar,
+                                  SkDynamicMemoryWStream* cmap) {
+    // PDF spec defines that every bf* list can have at most 100 entries.
+    for (int i = 0; i < bfchar.count(); i += 100) {
+        int count = bfchar.count() - i;
+        count = SkMin32(count, 100);
+        cmap->writeDecAsText(count);
+        cmap->writeText(" beginbfchar\n");
+        for (int j = 0; j < count; ++j) {
+            cmap->writeText("<");
+            cmap->writeHexAsText(bfchar[i + j].fGlyphId, 4);
+            cmap->writeText("> <");
+            cmap->writeHexAsText(bfchar[i + j].fUnicode, 4);
+            cmap->writeText(">\n");
+        }
+        cmap->writeText("endbfchar\n");
     }
 }
 
+static void append_bfrange_section(const SkTDArray<BFRange>& bfrange,
+                                   SkDynamicMemoryWStream* cmap) {
+    // PDF spec defines that every bf* list can have at most 100 entries.
+    for (int i = 0; i < bfrange.count(); i += 100) {
+        int count = bfrange.count() - i;
+        count = SkMin32(count, 100);
+        cmap->writeDecAsText(count);
+        cmap->writeText(" beginbfrange\n");
+        for (int j = 0; j < count; ++j) {
+            cmap->writeText("<");
+            cmap->writeHexAsText(bfrange[i + j].fStart, 4);
+            cmap->writeText("> <");
+            cmap->writeHexAsText(bfrange[i + j].fEnd, 4);
+            cmap->writeText("> <");
+            cmap->writeHexAsText(bfrange[i + j].fUnicode, 4);
+            cmap->writeText(">\n");
+        }
+        cmap->writeText("endbfrange\n");
+    }
+}
+
+// Generate <bfchar> and <bfrange> table according to PDF spec 1.4 and Adobe
+// Technote 5014.
+// The function is not static so we can test it in unit tests.
+//
+// Current implementation guarantees bfchar and bfrange entries do not overlap.
+//
+// Current implementation does not attempt aggresive optimizations against
+// following case because the specification is not clear.
+//
+// 4 beginbfchar          1 beginbfchar
+// <0003> <0013>          <0020> <0014>
+// <0005> <0015>    to    endbfchar
+// <0007> <0017>          1 beginbfrange
+// <0020> <0014>          <0003> <0007> <0013>
+// endbfchar              endbfrange
+//
+// Adobe Technote 5014 said: "Code mappings (unlike codespace ranges) may
+// overlap, but succeeding maps superceded preceding maps."
+//
+// In case of searching text in PDF, bfrange will have higher precedence so
+// typing char id 0x0014 in search box will get glyph id 0x0004 first.  However,
+// the spec does not mention how will this kind of conflict being resolved.
+//
+// For the worst case (having 65536 continuous unicode and we use every other
+// one of them), the possible savings by aggressive optimization is 416KB
+// pre-compressed and does not provide enough motivation for implementation.
+void append_cmap_sections(const SkTDArray<SkUnichar>& glyphToUnicode,
+                          const SkPDFGlyphSet* subset,
+                          SkDynamicMemoryWStream* cmap) {
+    if (glyphToUnicode.count() == 0) return;
+
+    SkTDArray<BFChar> bfcharEntries;
+    SkTDArray<BFRange> bfrangeEntries;
+
+    BFRange currentRangeEntry;
+    bool haveBase = false;
+    int continuousEntries = 0;
+
+    for (int i = 0; i < glyphToUnicode.count(); ++i) {
+        if (glyphToUnicode[i] && (subset == NULL || subset->has(i))) {
+            // PDF spec requires bfrange not changing the higher byte,
+            // e.g. <1035> <10FF> <2222> is ok, but
+            //      <1035> <1100> <2222> is no good
+            if (haveBase) {
+                ++continuousEntries;
+                if (i == currentRangeEntry.fStart + continuousEntries &&
+                            (i >> 8) == (currentRangeEntry.fStart >> 8) &&
+                            glyphToUnicode[i] == (currentRangeEntry.fUnicode +
+                                                  continuousEntries)) {
+                            currentRangeEntry.fEnd = i;
+                    if (i == glyphToUnicode.count() - 1) {
+                        // Last entry is in a range.
+                        bfrangeEntries.push(currentRangeEntry);
+                    }
+                    continue;
+                }
+
+                // Need to have at least 2 entries to form a bfrange.
+                if (continuousEntries >= 2) {
+                    bfrangeEntries.push(currentRangeEntry);
+                } else {
+                    BFChar* entry = bfcharEntries.append();
+                    entry->fGlyphId = currentRangeEntry.fStart;
+                    entry->fUnicode = currentRangeEntry.fUnicode;
+                }
+                continuousEntries = 0;
+            }
+
+            if (i != glyphToUnicode.count() - 1) {
+                currentRangeEntry.fStart = i;
+                currentRangeEntry.fEnd = i;
+                currentRangeEntry.fUnicode = glyphToUnicode[i];
+                haveBase = true;
+            } else {
+                BFChar* entry = bfcharEntries.append();
+                entry->fGlyphId = i;
+                entry->fUnicode = glyphToUnicode[i];
+            }
+        }
+    }
+
+    // The spec requires all bfchar entries for a font must come before bfrange
+    // entries.
+    append_bfchar_section(bfcharEntries, cmap);
+    append_bfrange_section(bfrangeEntries, cmap);
+}
+
 static SkPDFStream* generate_tounicode_cmap(
-                        const SkTDArray<SkUnichar>& glyphUnicode,
-                        const SkPDFGlyphSet* subset) {
+        const SkTDArray<SkUnichar>& glyphToUnicode,
+        const SkPDFGlyphSet* subset) {
     SkDynamicMemoryWStream cmap;
     append_tounicode_header(&cmap);
-    append_cmap_bfchar_sections(glyphUnicode, subset, &cmap);
+    append_cmap_sections(glyphToUnicode, subset, &cmap);
     append_cmap_footer(&cmap);
     SkRefPtr<SkMemoryStream> cmapStream = new SkMemoryStream();
     cmapStream->unref();  // SkRefPtr and new took a reference.
diff --git a/tests/ToUnicode.cpp b/tests/ToUnicode.cpp
new file mode 100644
index 0000000..ad1e0a9
--- /dev/null
+++ b/tests/ToUnicode.cpp
@@ -0,0 +1,92 @@
+
+/*
+ * Copyright 2010 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+
+#include <string>
+
+#include "Test.h"
+#include "SkData.h"
+#include "SkPDFTypes.h"
+#include "SkPDFFont.h"
+#include "SkStream.h"
+
+static bool stream_equals(const SkDynamicMemoryWStream& stream, size_t offset,
+                          const void* buffer, size_t len) {
+    SkAutoDataUnref data(stream.copyToData());
+    if (offset + len > data.size()) {
+        return false;
+    }
+    return memcmp(data.bytes() + offset, buffer, len) == 0;
+}
+
+void append_cmap_sections(const SkTDArray<SkUnichar>& glyphToUnicode,
+                          const SkPDFGlyphSet* subset,
+                          SkDynamicMemoryWStream* cmap);
+static void TestToUnicode(skiatest::Reporter* reporter) {
+    SkTDArray<SkUnichar> glyphToUnicode;
+    SkTDArray<uint16_t> glyphsInSubset;
+    SkPDFGlyphSet subset;
+
+    glyphToUnicode.push(0);  // 0
+    glyphToUnicode.push(0);  // 1
+    glyphToUnicode.push(0);  // 2
+    glyphsInSubset.push(3);
+    glyphToUnicode.push(0x20);  // 3
+    glyphsInSubset.push(4);
+    glyphToUnicode.push(0x25);  // 4
+    glyphsInSubset.push(5);
+    glyphToUnicode.push(0x27);  // 5
+    glyphsInSubset.push(6);
+    glyphToUnicode.push(0x28);  // 6
+    glyphsInSubset.push(7);
+    glyphToUnicode.push(0x29);  // 7
+    glyphsInSubset.push(8);
+    glyphToUnicode.push(0x2F);  // 8
+    glyphsInSubset.push(9);
+    glyphToUnicode.push(0x33);  // 9
+    glyphToUnicode.push(0);  // 10
+    glyphsInSubset.push(11);
+    glyphToUnicode.push(0x35);  // 11
+    glyphsInSubset.push(12);
+    glyphToUnicode.push(0x36);  // 12
+    for (uint16_t i = 13; i < 0xFE; ++i) {
+        glyphToUnicode.push(0);  // Zero from index 0x9 to 0xFD
+    }
+    glyphsInSubset.push(0xFE);
+    glyphToUnicode.push(0x1010);
+    glyphsInSubset.push(0xFF);
+    glyphToUnicode.push(0x1011);
+    glyphsInSubset.push(0x100);
+    glyphToUnicode.push(0x1012);
+    glyphsInSubset.push(0x101);
+    glyphToUnicode.push(0x1013);
+
+    SkDynamicMemoryWStream buffer;
+    subset.set(glyphsInSubset.begin(), glyphsInSubset.count());
+    append_cmap_sections(glyphToUnicode, &subset, &buffer);
+
+    char expectedResult[] =
+"4 beginbfchar\n\
+<0003> <0020>\n\
+<0004> <0025>\n\
+<0008> <002F>\n\
+<0009> <0033>\n\
+endbfchar\n\
+4 beginbfrange\n\
+<0005> <0007> <0027>\n\
+<000B> <000C> <0035>\n\
+<00FE> <00FF> <1010>\n\
+<0100> <0101> <1012>\n\
+endbfrange\n";
+
+    REPORTER_ASSERT(reporter, stream_equals(buffer, 0, expectedResult,
+                                            buffer.getOffset()));
+}
+
+#include "TestClassDef.h"
+DEFINE_TESTCLASS("ToUnicode", ToUnicodeTestClass, TestToUnicode)