From 7a2eed5fb33098b10d67958472a23bb41a3f7b11 Mon Sep 17 00:00:00 2001
From: Mike FABIAN
Date: Thu, 13 Sep 2012 12:43:55 +0200
Subject: [PATCH] Improve check whether a phrase is simplified or traditional Chinese

The improvement is to ignore all non-Han characters when doing the
check. This is to avoid classifying a simplified Chinese string as
traditional just because it happens to include some non-Chinese
characters, for example box drawing characters, which cannot be
converted to gb2312 but happen to be convertible to big5hkscs.

This fixes the problem in the emoji-table input method that most
phrases cannot be input at all.

See: https://bugzilla.redhat.com/show_bug.cgi?id=856320
---
 engine/tabsqlitedb.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/engine/tabsqlitedb.py b/engine/tabsqlitedb.py
index 7606afb..9e3b50c 100644
--- a/engine/tabsqlitedb.py
+++ b/engine/tabsqlitedb.py
@@ -483,17 +483,27 @@ class tabsqlitedb:
         # make sure that we got a unicode string
         if type(phrase) != type(u''):
             phrase = phrase.decode('utf8')
+        tmp_phrase = ''.join(re.findall(u'['
+                                        + u'\u4E00-\u9FCB'
+                                        + u'\u3400-\u4DB5'
+                                        + u'\uF900-\uFaFF'
+                                        + u'\U00020000-\U0002A6D6'
+                                        + u'\U0002A700-\U0002B734'
+                                        + u'\U0002B740-\U0002B81D'
+                                        + u'\U0002F800-\U0002FA1D'
+                                        + u']+',
+                                        phrase))
         # first whether in gb2312
         try:
-            phrase.encode('gb2312')
+            tmp_phrase.encode('gb2312')
             category |= 1
         except:
-            if '〇'.decode('utf8') in phrase:
+            if '〇'.decode('utf8') in tmp_phrase:
                 # we add '〇' into SC as well
                 category |= 1
         # second check big5-hkscs
         try:
-            phrase.encode('big5hkscs')
+            tmp_phrase.encode('big5hkscs')
             category |= 1 << 1
         except:
             # then check whether in gbk,
@@ -503,7 +513,7 @@ class tabsqlitedb:
             else:
                 # need to check
                 try:
-                    phrase.encode('gbk')
+                    tmp_phrase.encode('gbk')
                     category |= 1
                 except:
                     # not in gbk
-- 
1.7.11.4