From f4f70a49c42905d1e249a750ec9c68cd469586f8 Mon Sep 17 00:00:00 2001
From: Tim Brody
Date: Fri, 25 Mar 2011 13:02:18 +0000
Subject: [PATCH] Turn TeX-style composed characters into Unicode combining characters during text conversion.

---
 poppler/TextOutputDev.cc         |   39 +++++++++++++++++++++++----
 poppler/UnicodeCompEquivTables.h |   54 ++++++++++++++++++++++++++++++++++++++
 poppler/UnicodeTypeTable.cc      |   25 +++++++++++++++++
 poppler/UnicodeTypeTable.h       |    2 +
 4 files changed, 114 insertions(+), 6 deletions(-)
 create mode 100644 poppler/UnicodeCompEquivTables.h

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 13c67c6..b949986 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2161,6 +2161,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
   GBool overlap;
   int i;
+  Unicode *uc = NULL; // u + combining character
 
   // subtract char and word spacing from the dx,dy values
   sp = state->getCharSpace();
@@ -2236,12 +2237,35 @@ void TextPage::addChar(GfxState *state, double x, double y,
       }
       overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
         fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
-      if (overlap || lastCharOverlap ||
-          sp < -minDupBreakOverlap * curWord->fontSize ||
-          sp > minWordBreakSpace * curWord->fontSize ||
-          fabs(base - curWord->base) > 0.5 ||
-          curFontSize != curWord->fontSize) {
-        endWord();
+      // whitespace caused by delta
+      if (sp > minWordBreakSpace * curWord->fontSize) {
+        endWord();
+      }
+      // font size changed
+      else if (curFontSize != curWord->fontSize) {
+        endWord();
+      }
+      // vertical whitespace
+      else if (fabs(base - curWord->base) > 0.5 ) {
+        endWord();
+      }
+      // overlapping characters
+      else if (overlap || lastCharOverlap ||
+        sp < -minDupBreakOverlap * curWord->fontSize ) {
+        // "u => ü, as seen in pdflatex output
+        Unicode uu;
+        if (unicodeCombineEquiv (curWord->text[curWord->len - 1], &uu)) {
+          curWord->len--;
+          curWord->charLen--;
+          overlap = gFalse;
+          uc = (Unicode *) gmallocn (uLen + 1, sizeof (Unicode)); // +1: room for the combining mark appended below
+          memcpy (uc, u, uLen * sizeof (Unicode));
+          uc[uLen++] = uu;
+          u = uc;
+        }
+        else {
+          endWord();
+        }
       }
       lastCharOverlap = overlap;
     } else {
@@ -2293,6 +2317,9 @@ void TextPage::addChar(GfxState *state, double x, double y,
       }
     }
   }
+  if (uc) {
+    gfree (uc);
+  }
   if (curWord) {
     curWord->charLen += nBytes;
   }
diff --git a/poppler/UnicodeCompEquivTables.h b/poppler/UnicodeCompEquivTables.h
new file mode 100644
index 0000000..7b28ea1
--- /dev/null
+++ b/poppler/UnicodeCompEquivTables.h
@@ -0,0 +1,54 @@
+// Generated by combining.pl at Thu Mar 24 11:44:21 2011
+
+typedef struct {
+  Unicode character;
+  Unicode combining;
+} combine_equiv;
+
+#define COMBINE_EQUIV_TABLE_LENGTH 43
+
+static const combine_equiv combine_equiv_table[] = {
+  { 0x0022, 0x030e },
+  { 0x0027, 0x0301 },
+  { 0x005e, 0x0302 },
+  { 0x005f, 0x0332 },
+  { 0x0060, 0x0300 },
+  { 0x007e, 0x0303 },
+  { 0x00a8, 0x0308 },
+  { 0x00af, 0x0305 },
+  { 0x00b0, 0x030a },
+  { 0x00b4, 0x0301 },
+  { 0x00b8, 0x0327 },
+  { 0x02b1, 0x0324 },
+  { 0x02b2, 0x0321 },
+  { 0x02b7, 0x032b },
+  { 0x02b9, 0x0301 },
+  { 0x02ba, 0x030b },
+  { 0x02bb, 0x0312 },
+  { 0x02bc, 0x0315 },
+  { 0x02bd, 0x0314 },
+  { 0x02c0, 0x0309 },
+  { 0x02c6, 0x0302 },
+  { 0x02c7, 0x030c },
+  { 0x02c8, 0x030d },
+  { 0x02c9, 0x0304 },
+  { 0x02ca, 0x0301 },
+  { 0x02cb, 0x0300 },
+  { 0x02cc, 0x0329 },
+  { 0x02cd, 0x0331 },
+  { 0x02d4, 0x0323 },
+  { 0x02d5, 0x031e },
+  { 0x02d6, 0x031f },
+  { 0x02d7, 0x0320 },
+  { 0x02d8, 0x0306 },
+  { 0x02d9, 0x0307 },
+  { 0x02da, 0x030a },
+  { 0x02db, 0x0328 },
+  { 0x02dc, 0x0303 },
+  { 0x02dd, 0x030b },
+  { 0x0384, 0x0301 },
+  { 0x0559, 0x0314 },
+  { 0x055a, 0x0313 },
+  { 0x0901, 0x0310 },
+  { 0x2017, 0x0333 },
+};
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index c0483a5..cf4c762 100644
--- a/poppler/UnicodeTypeTable.cc
+++ b/poppler/UnicodeTypeTable.cc
@@ -22,6 +22,7 @@
 #include
 #include "CharTypes.h"
 #include "UnicodeTypeTable.h"
+#include "UnicodeCompEquivTables.h"
 #include "goo/gmem.h"
 
 struct UnicodeMapTableEntry {
@@ -1095,6 +1096,30 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) {
     (((v) - HANGUL_V_BASE) + (HANGUL_V_COUNT * ((l) - HANGUL_L_BASE)))))
 #define HANGUL_COMPOSE_LV_T(lv, t) ((lv) + ((t) - HANGUL_T_BASE))
 
+// Maps the spacing character @in to its combining equivalent (for example
+// U+00A8 -> U+0308) by binary search over combine_equiv_table, which must
+// stay sorted by its character field.  On a match the combining character
+// is stored in @out and gTrue is returned; otherwise gFalse is returned.
+GBool unicodeCombineEquiv(Unicode in, Unicode *out) {
+  int start = 0, end = COMBINE_EQUIV_TABLE_LENGTH;
+
+  while (gTrue) {
+    int midpoint = (start+end) / 2;
+    if (combine_equiv_table[midpoint].character == in) {
+      *out = combine_equiv_table[midpoint].combining;
+      return gTrue;
+    }
+    else if (start == midpoint)
+      break;
+    else if (in > combine_equiv_table[midpoint].character)
+      start = midpoint;
+    else
+      end = midpoint;
+  }
+
+  return gFalse;
+}
+
 // Converts Unicode string @in of length @len to its normalization in form
 // NFKC (compatibility decomposition + canonical composition).  The length of
 // the resulting Unicode string is returned in @out_len.  If non-NULL, @indices
diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h
index 939e916..cabe80e 100644
--- a/poppler/UnicodeTypeTable.h
+++ b/poppler/UnicodeTypeTable.h
@@ -28,6 +28,8 @@ extern GBool unicodeTypeR(Unicode c);
 
 extern Unicode unicodeToUpper(Unicode c);
 
+extern GBool unicodeCombineEquiv(Unicode in, Unicode *out);
+
 extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
                                      int *out_len, int **offsets);
 
-- 
1.7.2.3