From 7a2eed5fb33098b10d67958472a23bb41a3f7b11 Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Thu, 13 Sep 2012 12:43:55 +0200
Subject: [PATCH] Improve check whether a phrase is simplified or traditional
 Chinese

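The improvement is to ignore all non-Han characters when
doing the check, i.e. only the Han characters of the phrase are
tested for being encodable in gb2312, big5hkscs and gbk.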

This is to avoid classifying a simplified Chinese string as
traditional just because it happens to include some non-Chinese
characters, for example box drawing characters, which cannot be
converted to gb2312 but happen to be convertible to big5hkscs.

This fixes the problem that most phrases of the emoji-table input
method could not be input at all.

See: https://bugzilla.redhat.com/show_bug.cgi?id=856320
---
 engine/tabsqlitedb.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/engine/tabsqlitedb.py b/engine/tabsqlitedb.py
index 7606afb..9e3b50c 100644
--- a/engine/tabsqlitedb.py
+++ b/engine/tabsqlitedb.py
@@ -483,17 +483,27 @@ class tabsqlitedb:
             # make sure that we got a unicode string
             if type(phrase) != type(u''):
                 phrase = phrase.decode('utf8')
+            tmp_phrase = ''.join(re.findall(u'['
+                                            + u'\u4E00-\u9FCB'
+                                            + u'\u3400-\u4DB5'
+                                            + u'\uF900-\uFaFF'
+                                            + u'\U00020000-\U0002A6D6'
+                                            + u'\U0002A700-\U0002B734'
+                                            + u'\U0002B740-\U0002B81D'
+                                            + u'\U0002F800-\U0002FA1D'
+                                            + u']+',
+                                            phrase))
             # first whether in gb2312
             try:
-                phrase.encode('gb2312')
+                tmp_phrase.encode('gb2312')
                 category |= 1
             except:
-                if '〇'.decode('utf8') in phrase:
+                if '〇'.decode('utf8') in tmp_phrase:
                     # we add '〇' into SC as well
                     category |= 1
             # second check big5-hkscs
             try:
-                phrase.encode('big5hkscs')
+                tmp_phrase.encode('big5hkscs')
                 category |= 1 << 1
             except:
                 # then check whether in gbk,
@@ -503,7 +513,7 @@ class tabsqlitedb:
                 else:
                     # need to check
                     try:
-                        phrase.encode('gbk')
+                        tmp_phrase.encode('gbk')
                         category |= 1
                     except:
                         # not in gbk
-- 
1.7.11.4