From 26543fafc3143497fd8cc70a0a7e9777ed61ea07 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 27 Dec 2011 11:30:33 +0800 Subject: [PATCH] update to 0.4.91 --- libpinyin-0.3.x-head.patch | 86 ---------------------- libpinyin-0.5.x-head.patch | 141 +++++++++++++++++++++++++++++++++++++ libpinyin.spec | 9 ++- 3 files changed, 147 insertions(+), 89 deletions(-) delete mode 100644 libpinyin-0.3.x-head.patch create mode 100644 libpinyin-0.5.x-head.patch diff --git a/libpinyin-0.3.x-head.patch b/libpinyin-0.3.x-head.patch deleted file mode 100644 index 4b1512a..0000000 --- a/libpinyin-0.3.x-head.patch +++ /dev/null @@ -1,86 +0,0 @@ -From f332a01334342bdd4169324bdf889386ff3676fa Mon Sep 17 00:00:00 2001 -From: Peng Wu -Date: Thu, 24 Nov 2011 13:02:10 +0800 -Subject: [PATCH 1/3] increase train_factor because of larger model data - ---- - src/lookup/pinyin_lookup.cpp | 4 ++-- - 1 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp -index d6ba68c..7146e51 100644 ---- a/src/lookup/pinyin_lookup.cpp -+++ b/src/lookup/pinyin_lookup.cpp -@@ -449,7 +449,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const - //TODO: verify the new training method. - phrase_token_t last_token = sentence_start; - // constraints->len + 1 == results->len -- guint32 train_factor = 23; -+ guint32 train_factor = 23 * 5; - for ( size_t i = 0; i < constraints->len; ++i){ - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - if ( *token == null_token ) -@@ -466,7 +466,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const - //std::cout<<"i:"<get_phrase_item(*token, m_cache_phrase_item); - m_cache_phrase_item.increase_pinyin_possibility(*m_custom, pinyin_keys + i, train_factor); -- m_phrase_index->add_unigram_frequency(*token, train_factor); -+ m_phrase_index->add_unigram_frequency(*token, train_factor * 10); - if ( last_token ){ - SingleGram * system, *user; - m_system_bigram->load(last_token, system); --- -1.7.7.3 - - -From de8057576011eb536d87194da10c9ec48dd8d092 Mon Sep 17 00:00:00 2001 -From: Peng Wu -Date: Fri, 25 Nov 2011 14:58:45 +0800 -Subject: [PATCH 2/3] add const modifiers to train factor - ---- - src/lookup/pinyin_lookup.cpp | 2 +- - 1 files changed, 1 insertions(+), 1 deletions(-) - -diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp -index 7146e51..e2f563c 100644 ---- a/src/lookup/pinyin_lookup.cpp -+++ b/src/lookup/pinyin_lookup.cpp -@@ -449,7 +449,7 @@ bool PinyinLookup::train_result(PinyinKeyVector keys, CandidateConstraints const - //TODO: verify the new training method. - phrase_token_t last_token = sentence_start; - // constraints->len + 1 == results->len -- guint32 train_factor = 23 * 5; -+ const guint32 train_factor = 23 * 5; - for ( size_t i = 0; i < constraints->len; ++i){ - phrase_token_t * token = &g_array_index(results, phrase_token_t, i); - if ( *token == null_token ) --- -1.7.7.3 - - -From 47dca981b4d0f155f80087ee892bd2ff80429e7c Mon Sep 17 00:00:00 2001 -From: Peng Wu -Date: Fri, 25 Nov 2011 15:18:42 +0800 -Subject: [PATCH 3/3] update lambda parameter - ---- - src/include/novel_types.h | 2 +- - 1 files changed, 1 insertions(+), 1 deletions(-) - -diff --git a/src/include/novel_types.h b/src/include/novel_types.h -index 1c4fb2b..110d041 100644 ---- a/src/include/novel_types.h -+++ b/src/include/novel_types.h -@@ -144,7 +144,7 @@ typedef guint32 table_offset_t; - - typedef double parameter_t; - --#define LAMBDA_PARAMETER 0.588792 -+#define LAMBDA_PARAMETER 0.330642 - - /* Array of phrase_token_t */ - typedef GArray * TokenVector; --- -1.7.7.3 - diff --git a/libpinyin-0.5.x-head.patch b/libpinyin-0.5.x-head.patch new file mode 100644 index 0000000..a18ee90 --- /dev/null +++ b/libpinyin-0.5.x-head.patch @@ -0,0 +1,141 @@ +From 11dfb3b72c7128e05e8608ff501d06b80a2788c1 Mon Sep 17 00:00:00 2001 +From: Peng Wu +Date: Mon, 26 Dec 2011 14:56:09 +0800 +Subject: [PATCH 1/4] fixes pinyin_translate_token + +--- + src/pinyin.cpp | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/pinyin.cpp b/src/pinyin.cpp +index f4a20fe..a3fd37d 100644 +--- a/src/pinyin.cpp ++++ b/src/pinyin.cpp +@@ -590,11 +590,11 @@ bool pinyin_translate_token(pinyin_instance_t * instance, + PhraseItem item; + utf16_t buffer[MAX_PHRASE_LENGTH]; + +- bool retval = context->m_phrase_index->get_phrase_item(token, item); ++ int retval = context->m_phrase_index->get_phrase_item(token, item); + item.get_phrase_string(buffer); + guint8 length = item.get_phrase_length(); + *word = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); +- return retval; ++ return retval == ERROR_OK; + } + + bool pinyin_train(pinyin_instance_t * instance){ +-- +1.7.7.4 + + +From 49869f6917edf488f0daca22e32a8166cf6e0325 Mon Sep 17 00:00:00 2001 +From: Peng Wu +Date: Mon, 26 Dec 2011 15:17:46 +0800 +Subject: [PATCH 2/4] increase train factor + +--- + src/lookup/pinyin_lookup.cpp | 2 +- + src/pinyin.cpp | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/lookup/pinyin_lookup.cpp b/src/lookup/pinyin_lookup.cpp +index a01b28a..efad86f 100644 +--- a/src/lookup/pinyin_lookup.cpp ++++ b/src/lookup/pinyin_lookup.cpp +@@ -446,7 +446,7 @@ bool PinyinLookup::train_result(ChewingKeyVector keys, CandidateConstraints cons + //TODO: verify the new training method. + phrase_token_t last_token = sentence_start; + // constraints->len + 1 == results->len +- const guint32 train_factor = 23 * 5; ++ const guint32 train_factor = 23 * 25; + for ( size_t i = 0; i < constraints->len; ++i){ + phrase_token_t * token = &g_array_index(results, phrase_token_t, i); + if ( *token == null_token ) +diff --git a/src/pinyin.cpp b/src/pinyin.cpp +index a3fd37d..5a1b683 100644 +--- a/src/pinyin.cpp ++++ b/src/pinyin.cpp +@@ -594,7 +594,7 @@ bool pinyin_translate_token(pinyin_instance_t * instance, + item.get_phrase_string(buffer); + guint8 length = item.get_phrase_length(); + *word = g_utf16_to_utf8(buffer, length, NULL, NULL, NULL); +- return retval == ERROR_OK; ++ return ERROR_OK == retval; + } + + bool pinyin_train(pinyin_instance_t * instance){ +-- +1.7.7.4 + + +From 2fd2eea102bfa32d662dca823bf1cfeed3a94c1c Mon Sep 17 00:00:00 2001 +From: Peng Wu +Date: Mon, 26 Dec 2011 18:34:49 +0800 +Subject: [PATCH 3/4] fixes parallel make + +--- + data/Makefile.am | 10 ++++++++-- + 1 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/data/Makefile.am b/data/Makefile.am +index 25e4683..6f74f4e 100644 +--- a/data/Makefile.am ++++ b/data/Makefile.am +@@ -34,16 +34,22 @@ libpinyin_dbdir = $(pkgdatadir)/data + + CLEANFILES = $(binary_model_data) + +-$(textual_model_data): ++model.text.tar.gz: + wget https://github.com/downloads/libpinyin/libpinyin/model.text.tar.gz ++ ++interpolation.text: model.text.tar.gz + tar xvf model.text.tar.gz + +-$(binary_model_data): $(textual_model_data) ++gb_char.table gbk_char.table: interpolation.text ++ ++bigram.db: $(textual_model_data) + $(RM) $(binary_model_data) + ../utils/storage/gen_binary_files --table-dir $(top_srcdir)/data + ../utils/storage/import_interpolation < $(top_srcdir)/data/interpolation.text + ../utils/training/gen_unigram + ++gb_char.bin gbk_char.bin phrase_index.bin pinyin_index.bin: bigram.db ++ + rebuild: + git reset --hard + sed -i -e "s/#define LAMBDA_PARAMETER 0.588792/#define LAMBDA_PARAMETER $(LAMBDA_PARAMETER)/" ../src/include/novel_types.h +-- +1.7.7.4 + + +From 13aac5af9c6e3f3a2e243806f102e7477b686c91 Mon Sep 17 00:00:00 2001 +From: Peng Wu +Date: Tue, 27 Dec 2011 11:09:06 +0800 +Subject: [PATCH 4/4] fixes data/Makefile.am + +--- + data/Makefile.am | 4 +--- + 1 files changed, 1 insertions(+), 3 deletions(-) + +diff --git a/data/Makefile.am b/data/Makefile.am +index 6f74f4e..2964bb9 100644 +--- a/data/Makefile.am ++++ b/data/Makefile.am +@@ -34,10 +34,8 @@ libpinyin_dbdir = $(pkgdatadir)/data + + CLEANFILES = $(binary_model_data) + +-model.text.tar.gz: ++interpolation.text: + wget https://github.com/downloads/libpinyin/libpinyin/model.text.tar.gz +- +-interpolation.text: model.text.tar.gz + tar xvf model.text.tar.gz + + gb_char.table gbk_char.table: interpolation.text +-- +1.7.7.4 + diff --git a/libpinyin.spec b/libpinyin.spec index af3d135..0c0d22d 100644 --- a/libpinyin.spec +++ b/libpinyin.spec @@ -1,12 +1,12 @@ Name: libpinyin -Version: 0.3.0 -Release: 2%{?dist} +Version: 0.4.91 +Release: 1%{?dist} Summary: Library to deal with pinyin License: GPLv2+ URL: https://github.com/libpinyin/libpinyin Source0: https://github.com/downloads/libpinyin/libpinyin/%{name}-%{version}.tar.gz -Patch0: libpinyin-0.3.x-head.patch +Patch0: libpinyin-0.5.x-head.patch BuildRequires: db4-devel, glib2-devel Requires: %{name}-data = %{version}-%{release} @@ -70,6 +70,9 @@ find $RPM_BUILD_ROOT -name '*.la' -exec rm -f {} ';' %{_datadir}/libpinyin/data %changelog +* Tue Dec 27 2011 Peng Wu - 0.4.91-1 +- Update to 0.4.91 + * Fri Nov 25 2011 Peng Wu - 0.3.0-2 - Increase train factor