diff --git a/.cvsignore b/.cvsignore index 6e58d5a..7136913 100644 --- a/.cvsignore +++ b/.cvsignore @@ -1 +1 @@ -buckwalter_morphan_1_hunspell_patch.tar.gz +hunspell-ar_20080110.tar.gz diff --git a/buckwalter2unicode.py b/buckwalter2unicode.py deleted file mode 100644 index 94fec32..0000000 --- a/buckwalter2unicode.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/python - -# buckwalter2unicode.py - A script to convert transliterated Arabic -# (using the Buckwalter system) to Unicode. -# -# Version 0.2 - 15th September 2004 -# -# Andrew Roberts (andyr [at] comp (dot) leeds [dot] ac (dot) uk) -# -# Project homepage: http://www.comp.leeds.ac.uk/andyr/software/ -# -# Now, listen carefully... -# -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# - -import sys, getopt, codecs, os, re - -# Declare a dictionary with Buckwalter's ASCII symbols as the keys, and -# their unicode equivalents as values. - -buck2uni = {"'": u"\u0621", # hamza-on-the-line - "|": u"\u0622", # madda - ">": u"\u0623", # hamza-on-'alif - "&": u"\u0624", # hamza-on-waaw - "<": u"\u0625", # hamza-under-'alif - "}": u"\u0626", # hamza-on-yaa' - "A": u"\u0627", # bare 'alif - "b": u"\u0628", # baa' - "p": u"\u0629", # taa' marbuuTa - "t": u"\u062A", # taa' - "v": u"\u062B", # thaa' - "j": u"\u062C", # jiim - "H": u"\u062D", # Haa' - "x": u"\u062E", # khaa' - "d": u"\u062F", # daal - "*": u"\u0630", # dhaal - "r": u"\u0631", # raa' - "z": u"\u0632", # zaay - "s": u"\u0633", # siin - "$": u"\u0634", # shiin - "S": u"\u0635", # Saad - "D": u"\u0636", # Daad - "T": u"\u0637", # Taa' - "Z": u"\u0638", # Zaa' (DHaa') - "E": u"\u0639", # cayn - "g": u"\u063A", # ghayn - "_": u"\u0640", # taTwiil - "f": u"\u0641", # faa' - "q": u"\u0642", # qaaf - "k": u"\u0643", # kaaf - "l": u"\u0644", # laam - "m": u"\u0645", # miim - "n": u"\u0646", # nuun - "h": u"\u0647", # haa' - "w": u"\u0648", # waaw - "Y": u"\u0649", # 'alif maqSuura - "y": u"\u064A", # yaa' - "F": u"\u064B", # fatHatayn - "N": u"\u064C", # Dammatayn - "K": u"\u064D", # kasratayn - "a": u"\u064E", # fatHa - "u": u"\u064F", # Damma - "i": u"\u0650", # kasra - "~": u"\u0651", # shaddah - "o": u"\u0652", # sukuun - "`": u"\u0670", # dagger 'alif - "{": u"\u0671", # waSla -} - -# For a reverse transliteration (Unicode -> Buckwalter), a dictionary -# which is the reverse of the above buck2uni is essential. - -uni2buck = {} - -# Iterate through all the items in the buck2uni dict. -for (key, value) in buck2uni.iteritems(): - # The value from buck2uni becomes a key in uni2buck, and vice - # versa for the keys. - uni2buck[value] = key - -# Declare some global variables... - - -inFilename = "" # Name of filename containing input. -outFilename = "" # Name of filename to send the output -inEnc = "" # The text encoding of the input file -outEnc = "" # The text encoding for the output file -ignoreChars = "" # If lines begin with these symbols, ignore. -columnRange = "" # Holds columns numbers to transliterate. -delimiter = "" # Holds user-defined column delimiter. -reverse = 0 # When equal to 1, perform reverse transliteration, i.e., - # Unicode -> Buckwalter. - -# A function to print to screen the usage details of this script. - -def usage(): - print "Usage:", sys.argv[0], "-i INFILE -o OUTFILE [-g CHARS -c RANGE -d CHAR" - print " -r -e INPUT_ENCODING, -E OUTPUT ENCODING]" - print " ", sys.argv[0], "-l" - print " ", sys.argv[0], "-h" - print "" - print " -i INFILE, --input=INFILE:" - print " Path to text file to be transliterated to Unicode." - print " -o OUTFILE, --output=OUTFILE:" - print " Path of file to output the newly transliterated text." - print " -e ENC, --input-encoding=ENC:" - print " Specify the text encoding of the source file. Default: latin_1." - print " -E ENC, --output-encoding=ENC:" - print " Specify the text encoding of the target file. Default: utf_8." - print " -g CHARS, --ignore-lines=CHARS:" - print " Will not transliterate lines that start with any of the CHARS" - print " given. E.g., -g #; will not alter lines starting with # or ;." - print " (May need to be -g \#\; on some platforms. See README.txt.)" - print " -c RANGE, --columns=RANGE:" - print " If in columns, select columns to apply transliteration. Can be" - print " comma separated numbers, or a range. E.g., -c 1, -c 1-3, -c 1,3." - print " -d CHAR, --delimiter=CHAR:" - print " Specify the delimiter that defines the column if using the -c" - print " option above. Default is ' ' (space)." - print " -r, --reverse:" - print " Reverses the transliteration, i.e., Arabic to Buckwalter." - print " When used, it will change the default input encoding to utf_8 and" - print " output encoding to latin_1" - print " -l, --list-encodings:" - print " Displays all supported file encodings." - print " -h, --help:" - print " Displays this page." - print "" - -# A function to print to screen all the available encodings supported by -# Python. - -def displayEncodings(): - print "Codec Aliases Languages" - print "ascii 646, us-ascii English" - print "cp037 IBM037, IBM039 English" - print "cp424 EBCDIC-CP-HE, IBM424 Hebrew" - print "cp437 437, IBM437 English" - print "cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe" - print "cp737 Greek" - print "cp775 IBM775 Baltic languages" - print "cp850 850, IBM850 Western Europe" - print "cp852 852, IBM852 Central and Eastern Europe" - print "cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "cp856 Hebrew" - print "cp857 857, IBM857 Turkish" - print "cp860 860, IBM860 Portuguese" - print "cp861 861, CP-IS, IBM861 Icelandic" - print "cp862 862, IBM862 Hebrew" - print "cp863 863, IBM863 Canadian" - print "cp864 IBM864 Arabic" - print "cp865 865, IBM865 Danish, Norwegian" - print "cp869 869, CP-GR, IBM869 Greek" - print "cp874 Thai" - print "cp875 Greek" - print "cp1006 Urdu" - print "cp1026 ibm1026 Turkish" - print "cp1140 ibm1140 Western Europe" - print "cp1250 windows-1250 Central and Eastern Europe" - print "cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "cp1252 windows-1252 Western Europe" - print "cp1253 windows-1253 Greek" - print "cp1254 windows-1254 Turkish" - print "cp1255 windows-1255 Hebrew" - print "cp1256 windows-1256 Arabic" - print "cp1257 windows-1257 Baltic languages" - print "cp1258 windows-1258 Vietnamese" - print "latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 West Europe" - print "iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe" - print "iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese" - print "iso8859_4 iso-8859-4, latin4, L4 Baltic languagues" - print "iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "iso8859_6 iso-8859-6, arabic Arabic" - print "iso8859_7 iso-8859-7, greek, greek8 Greek" - print "iso8859_8 iso-8859-8, hebrew Hebrew" - print "iso8859_9 iso-8859-9, latin5, L5 Turkish" - print "iso8859_10 iso-8859-10, latin6, L6 Nordic languages" - print "iso8859_13 iso-8859-13 Baltic languages" - print "iso8859_14 iso-8859-14, latin8, L8 Celtic languages" - print "iso8859_15 iso-8859-15 Western Europe" - print "koi8_r Russian" - print "koi8_u Ukrainian" - print "mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian" - print "mac_greek macgreek Greek" - print "mac_iceland maciceland Icelandic" - print "mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe" - print "mac_roman macroman Western Europe" - print "mac_turkish macturkish Turkish" - print "utf_16 U16, utf16 all languages" - print "utf_16_be UTF-16BE all languages (BMP only)" - print "utf_16_le UTF-16LE all languages (BMP only)" - print "utf_7 U7 all languages" - print "utf_8 U8, UTF, utf8 all languages" - -def parseIgnoreString(string): - - symbols = [] - - for char in string: - symbols.append(char) - - return symbols - -# Begin parsing the command-line arguments... - -try: - (options, args) = getopt.getopt(sys.argv[1:], "i:o:e:E:g:c:d:rlh", - ["input=","output=", "input-encoding=", "output-encoding=", - "ignore-lines=", "columns=", "delimiter=" "reverse", "list-encodings", - "help"]) - -except getopt.GetoptError: - # print help information and exit: - usage() - sys.exit(1) - -# Loop over all arguments supplied by the user. -for (x, y) in options: - if x in ("-h", "--help"): - usage() - sys.exit(0) - - if x in ("-l", "--list-encodings"): - displayEncodings() - sys.exit(0) - - if x in ("-i", "--input"): inFilename = y - if x in ("-o", "--output"): outFilename = y - if x in ("-e", "--input-encoding"): inEnc= y - if x in ("-E", "--output-encoding"): outEnc= y - if x in ("-r", "--reverse"): reverse = 1 - if x in ("-g", "--ignore-lines"): ignoreChars = y - if x in ("-c", "--columns"): columnRange = y - if x in ("-d", "--delimiter"): - delimiter = y - # Tabs come in off the command line from "\\t" to "\t". However, - # that's equivalent to "\\t" from python's point of view. - # Therefore replace any inputted "tabs" with proper tabs before - # proceeding. - delimiter = delimiter.replace("\\t", "\t") - # Do some error checking - if len(delimiter) > 1: - print >>sys.stderr, "Delimeter should only be a single character. Using first character" + delimiter[0] - delimiter = delimiter[0] - - if buck2uni.get(delimiter): - print >> sys.stderr, "Invalid delimiter. \"" + delimiter + "\" is part of the Buckwalter character set." - print >> sys.stderr, "This will obviously cause much confusion as a delimiter!" - print >> sys.stderr, "Please try again. Aborting..." - sys.exit(1) - -# If no delimiter was set then, set the default to " " (space) -if not delimiter: - delimiter = " " - -# If user didn't specify the encoding of the input file, then revert to -# defaults. The defaults can depending on the direction of -# transliteration: -# -# Buckwalter -> Unicode, default = latin1 -# Unicode -> Buckwalter, default = utf_8 - - -if not inEnc: - if reverse: - inEnc = "utf_8" - else: - inEnc = "latin_1" - -# Similarly, if user didn't specify the encoding of the output file, -# then revert to defaults. The defaults can depending on the direction -# of transliteration: -# -# Buckwalter -> Unicode, default = utf_8 -# Unicode -> Buckwalter, default # = latin_1 - -if not outEnc: - if reverse: - outEnc = "latin_1" - else: - outEnc = "utf_8" - -# Ok, let's get the files open! - -# Providing a file for output was specified... -if outFilename: - try: - # Create a file object, set it to "write" mode using the - # specified output encoding. - outFile = codecs.open(outFilename, "w", outEnc) - - except IOError, msg: - # A problem occurred when trying to open this file. Report to - # user... - print msg - sys.exit(1) - -# Script can not work without somewhere to store the transliteration. -# Exit. -else: - print "Must specify a file to use store the output! Aborting..." - sys.exit(1) - -# Providing a file for input was specified... -if inFilename: - try: - # Create a file object, set it to "read" mode using the - # specified input encoding. - inFile = codecs.open(inFilename, "r", inEnc) - - except IOError, msg: - # A problem occurred when trying to open this file. Report to - # user... - print msg - sys.exit(1) - -# This script requires a file to read from. Exit. -else: - print "Must specify a file to use as input! Aborting..." - sys.exit(1) - -def getColsFromRange(cRange): - - columns = [] - hyphenSearch = re.compile(r'-') - - rangeElements = cRange.split(",") - - for i in rangeElements: - # If it contains a hyphen (e.g., 1-3) - if hyphenSearch.search(i): - [start, end] = i.split("-") - columns = columns + range(int(start)-1,int(end)) - else: - columns.append(int(i)-1) - - return columns - -# This function transliterates a given string. It checks the direction -# of the transliteration and then uses the appropriate dictionary. A -# transliterated string is returned. - -def transliterate(inString, lineNumber): - out = "" - - if columnRange: - columns = getColsFromRange(columnRange) - - # Split the line on the delimiter - lineCols = inString.split(delimiter) - - # Iterate over each column. If it's one of the ones in the range - # specified, then transliterate, otherwise just output column - # unchanged. - - for i in range(len(lineCols)): - - # If first column, then don't prefix the delimiter - if i == 0: - if i in columns: - out = transliterateString(lineCols[i]) - else : - out = lineCols[i] - else : - if i in columns: - out = out + delimiter + transliterateString(lineCols[i]) - else : - out = out + delimiter + lineCols[i] - - else: - out = transliterateString(inString) - - - - return out - -def transliterateString(inString): - - out = "" - - # For normal Buckwalter -> Unicode transliteration.. - if not reverse: - - # Loop over each character in the string, inString. - for char in inString: - # Look up current char in the dictionary to get its - # respective value. If there is no match, e.g., chars like - # spaces, then just stick with the current char without any - # conversion. - out = out + buck2uni.get(char, char) - - # Same as above, just in the other direction. - else: - - for char in inString: - out = out + uni2buck.get(char, char) - - return out - -#while 1: -# line = inFile.readline().strip() -# line = line.decode(inEnc) -# if not line: -# break - - # process string -# outFile.write(transliterate(line) + os.linesep) - -# Read in the lines of the input file. -lines = inFile.readlines() - -currentLineNumber = 1 -# Loop over each line -for line in lines: - line = line.strip() - try: - # Transliterate the current line, and then write the output to - # file. - - if not ignoreChars: - outFile.write(transliterate(line, currentLineNumber) + " " + os.linesep) - else: - if line[0] in parseIgnoreString(ignoreChars): - outFile.write(line + " " + os.linesep) - else: - outFile.write(transliterate(line, currentLineNumber) + " " + os.linesep) - - currentLineNumber = currentLineNumber + 1 - - except UnicodeError, msg: - # A problem when writing occurred. Report to user... - print msg - sys.exit(1) - -# All done! Better close the files used before terminating... -inFile.close() -outFile.close() - -# ... and relax! :) diff --git a/hunspell-ar.spec b/hunspell-ar.spec index 04b5e60..f0e7fe5 100644 --- a/hunspell-ar.spec +++ b/hunspell-ar.spec @@ -1,14 +1,13 @@ Name: hunspell-ar Summary: Arabic hunspell dictionaries -%define upstreamid 20060208 +%define upstreamid 20080110 Version: 0.%{upstreamid} Release: 1%{?dist} -Source0: http://hunspell.sourceforge.net/buckwalter_morphan_1_hunspell_patch.tar.gz -Source1: http://www.andy-roberts.net/software/buckwalter2unicode/buckwalter2unicode.py +Source: http://downloads.sourceforge.net/ayaspell/hunspell-ar_%{upstreamid}.tar.gz Group: Applications/Text -URL: http://hunspell.sourceforge.net +URL: http://ayaspell.sourceforge.net/ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -License: GPL +License: GPLv2 or LGPLv2 or MPLv1.1 BuildArch: noarch Requires: hunspell @@ -17,33 +16,21 @@ Requires: hunspell Arabic (Egypt, Algeria, etc.) hunspell dictionaries %prep -%setup -q -n buckwalter_morphan_1_hunspell_patch -cp %{SOURCE1} convert2hunspell -chmod +x convert2hunspell/buckwalter2unicode.py +%setup -q -n %{name}_%{upstreamid} %build -cd convert2hunspell -make - -./buckwalter2unicode.py -i arabic_alias.dic -o ../ar_LB.dic -grep -v '^.FX' arabic_alias.aff > nonFX -grep '^.FX' arabic_alias.aff > FX.in -./buckwalter2unicode.py -i FX.in -d ' ' -c 4 -o FX -cat nonFX FX > ../ar_LB.aff - -tr -d '\r' < ../doc/readme.txt > ../doc/readme.txt.new -mv -f ../doc/readme.txt.new ../doc/readme.txt %install rm -rf $RPM_BUILD_ROOT mkdir -p $RPM_BUILD_ROOT/%{_datadir}/myspell -cp -p *.dic *.aff $RPM_BUILD_ROOT/%{_datadir}/myspell +cp -p ar.dic $RPM_BUILD_ROOT/%{_datadir}/myspell/ar_TN.dic +cp -p ar.aff $RPM_BUILD_ROOT/%{_datadir}/myspell/ar_TN.aff pushd $RPM_BUILD_ROOT/%{_datadir}/myspell/ -ar_LB_aliases="ar_AE ar_BH ar_DZ ar_EG ar_IN ar_IQ ar_JO ar_KW ar_LY ar_MA ar_OM ar_QA ar_SA ar_SD ar_SY ar_TN ar_YE" -for lang in $ar_LB_aliases; do - ln -s ar_LB.aff $lang.aff - ln -s ar_LB.dic $lang.dic +ar_TN_aliases="ar_AE ar_BH ar_DZ ar_EG ar_IN ar_IQ ar_JO ar_LB ar_KW ar_LY ar_MA ar_OM ar_QA ar_SA ar_SD ar_SY ar_YE" +for lang in $ar_TN_aliases; do + ln -s ar_TN.aff $lang.aff + ln -s ar_TN.dic $lang.dic done popd @@ -52,9 +39,12 @@ rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root,-) -%doc gpl.txt doc/readme.txt convert2hunspell/README +%doc AUTHORS ChangeLog-ar COPYING README-ar THANKS %{_datadir}/myspell/* %changelog +* Thu Apr 28 2008 Caolan McNamara - 0.20080110-1 +- use the much lighter ayaspell data instead of Buckwalter + * Wed Jun 06 2007 Caolan McNamara - 0.20060208-1 - initial version diff --git a/sources b/sources index c567644..9ea39aa 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -28c3b7dce962051d55b23ac5e51f264b buckwalter_morphan_1_hunspell_patch.tar.gz +69990932920960eb765fd35883640124 hunspell-ar_20080110.tar.gz