1632 lines
53 KiB
Diff
1632 lines
53 KiB
Diff
diff -ru mythes-en-3.0/wn2ooo/wn2ooo.py mythes-en-3.0.fixed/wn2ooo/wn2ooo.py
|
|
--- mythes-en-3.0/wn2ooo/wn2ooo.py 2005-07-23 23:21:20.000000000 +0100
|
|
+++ mythes-en-3.0.fixed/wn2ooo/wn2ooo.py 2018-07-30 14:46:52.695201526 +0100
|
|
@@ -65,7 +65,7 @@
|
|
entry = getRelatedTerms(u, HYPERNYM, '')
|
|
try:
|
|
listpos = l.index(entry)
|
|
- except ValueError, e:
|
|
+ except ValueError as e:
|
|
l.append(entry)
|
|
return str.join("|", l)
|
|
|
|
@@ -74,12 +74,12 @@
|
|
for word in words:
|
|
l = []
|
|
if c % 100 == 0:
|
|
- print >> sys.stderr, "Working on word %d" % c
|
|
+ print("Working on word %d" % c, file=sys.stderr)
|
|
for pos in [ADJ, N, V, ADV]:
|
|
try:
|
|
for s in pos[word].getSenses():
|
|
l.append(s)
|
|
- except KeyError, e:
|
|
+ except KeyError as e:
|
|
#print >> sys.stderr, e
|
|
continue
|
|
syn_count = 0
|
|
@@ -118,7 +118,7 @@
|
|
syn_lines = "%s%s" % (syn_lines, more_generic_terms)
|
|
syn_count = syn_count + 1
|
|
if syn_count > 0:
|
|
- print "%s|%d\n%s" % (word, syn_count, syn_lines)
|
|
+ print("%s|%d\n%s" % (word, syn_count, syn_lines))
|
|
c = c + 1
|
|
return
|
|
|
|
@@ -132,40 +132,38 @@
|
|
return s
|
|
|
|
def main():
|
|
- print "ISO8859-1"
|
|
+ print("ISO8859-1")
|
|
|
|
words = {}
|
|
dic = Dictionary(ADJECTIVE, "adj")
|
|
- for w in dic.keys():
|
|
+ for w in list(dic.keys()):
|
|
words[w] = None
|
|
|
|
dic = Dictionary(NOUN, "noun")
|
|
- for w in dic.keys():
|
|
+ for w in list(dic.keys()):
|
|
words[w] = None
|
|
|
|
dic = Dictionary(VERB, "verb")
|
|
- for w in dic.keys():
|
|
+ for w in list(dic.keys()):
|
|
words[w] = None
|
|
|
|
dic = Dictionary(ADVERB, "adv")
|
|
- for w in dic.keys():
|
|
+ for w in list(dic.keys()):
|
|
words[w] = None
|
|
|
|
- words = words.keys()
|
|
+ words = list(words.keys())
|
|
# tests:
|
|
#words = ['dog', 'house', 'nullipara']
|
|
#words = ['absent', 'whistle stop']
|
|
#words = ['war']
|
|
- print >>sys.stderr, "Dictionaries contain %d words" % len(words)
|
|
- print >>sys.stderr, "Sorting..."
|
|
- words.sort(mycompare)
|
|
+ print("Dictionaries contain %d words" % len(words), file=sys.stderr)
|
|
+ print("Sorting...", file=sys.stderr)
|
|
+ words = sorted(words, key=mycompare)
|
|
printSynsForWords(words)
|
|
return
|
|
|
|
-def mycompare(a, b):
|
|
+def mycompare(elem):
|
|
# stupid hack to make sure the list is sorted like Kevin's original list:
|
|
- a = a.replace(" ", "Z")
|
|
- b = b.replace(" ", "Z")
|
|
- return cmp(a, b)
|
|
+ return elem.replace(" ", "Z")
|
|
|
|
main()
|
|
diff -ru mythes-en-3.0/wn2ooo/wordnet.py mythes-en-3.0.fixed/wn2ooo/wordnet.py
|
|
--- mythes-en-3.0/wn2ooo/wordnet.py 2005-07-23 23:21:16.000000000 +0100
|
|
+++ mythes-en-3.0.fixed/wn2ooo/wordnet.py 2018-07-30 14:46:52.695201526 +0100
|
|
@@ -44,7 +44,6 @@
|
|
import string
|
|
import os
|
|
from os import environ
|
|
-from types import IntType, ListType, StringType, TupleType
|
|
|
|
|
|
#
|
|
@@ -212,15 +211,15 @@
|
|
|
|
def __init__(self, line):
|
|
"""Initialize the word from a line of a WN POS file."""
|
|
- tokens = string.split(line)
|
|
- ints = map(int, tokens[int(tokens[3]) + 4:])
|
|
- self.form = string.replace(tokens[0], '_', ' ')
|
|
+ tokens = line.split()
|
|
+ ints = list(map(int, tokens[int(tokens[3]) + 4:]))
|
|
+ self.form = tokens[0].replace('_', ' ')
|
|
"Orthographic representation of the word."
|
|
- self.pos = _normalizePOS(tokens[1])
|
|
+ self.pos = _normalizePOS(tokens[1])
|
|
"Part of speech. One of NOUN, VERB, ADJECTIVE, ADVERB."
|
|
- self.taggedSenseCount = ints[1]
|
|
+ self.taggedSenseCount = ints[1]
|
|
"Number of senses that are tagged."
|
|
- self._synsetOffsets = ints[2:ints[0]+2]
|
|
+ self._synsetOffsets = ints[2:ints[0]+2]
|
|
|
|
def getPointers(self, pointerType=None):
|
|
"""Pointers connect senses and synsets, not words.
|
|
@@ -233,17 +232,17 @@
|
|
raise self.getPointers.__doc__
|
|
|
|
def getSenses(self):
|
|
- """Return a sequence of senses.
|
|
-
|
|
- >>> N['dog'].getSenses()
|
|
- ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
|
|
- """
|
|
- if not hasattr(self, '_senses'):
|
|
- def getSense(offset, pos=self.pos, form=self.form):
|
|
- return getSynset(pos, offset)[form]
|
|
- self._senses = tuple(map(getSense, self._synsetOffsets))
|
|
- del self._synsetOffsets
|
|
- return self._senses
|
|
+ """Return a sequence of senses.
|
|
+
|
|
+ >>> N['dog'].getSenses()
|
|
+ ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
|
|
+ """
|
|
+ if not hasattr(self, '_senses'):
|
|
+ def getSense(offset, pos=self.pos, form=self.form):
|
|
+ return getSynset(pos, offset)[form]
|
|
+ self._senses = tuple(map(getSense, self._synsetOffsets))
|
|
+ del self._synsetOffsets
|
|
+ return self._senses
|
|
|
|
# Deprecated. Present for backwards compatability.
|
|
def senses(self):
|
|
@@ -255,70 +254,70 @@
|
|
return self.getSense()
|
|
|
|
def isTagged(self):
|
|
- """Return 1 if any sense is tagged.
|
|
-
|
|
- >>> N['dog'].isTagged()
|
|
- 1
|
|
- """
|
|
- return self.taggedSenseCount > 0
|
|
+ """Return 1 if any sense is tagged.
|
|
+
|
|
+ >>> N['dog'].isTagged()
|
|
+ 1
|
|
+ """
|
|
+ return self.taggedSenseCount > 0
|
|
|
|
def getAdjectivePositions(self):
|
|
- """Return a sequence of adjective positions that this word can
|
|
- appear in. These are elements of ADJECTIVE_POSITIONS.
|
|
-
|
|
- >>> ADJ['clear'].getAdjectivePositions()
|
|
- [None, 'predicative']
|
|
- """
|
|
- positions = {}
|
|
- for sense in self.getSenses():
|
|
- positions[sense.position] = 1
|
|
- return positions.keys()
|
|
+ """Return a sequence of adjective positions that this word can
|
|
+ appear in. These are elements of ADJECTIVE_POSITIONS.
|
|
+
|
|
+ >>> ADJ['clear'].getAdjectivePositions()
|
|
+ [None, 'predicative']
|
|
+ """
|
|
+ positions = {}
|
|
+ for sense in self.getSenses():
|
|
+ positions[sense.position] = 1
|
|
+ return list(positions.keys())
|
|
|
|
adjectivePositions = getAdjectivePositions # backwards compatability
|
|
|
|
def __cmp__(self, other):
|
|
- """
|
|
- >>> N['cat'] < N['dog']
|
|
- 1
|
|
- >>> N['dog'] < V['dog']
|
|
- 1
|
|
- """
|
|
- return _compareInstances(self, other, ('pos', 'form'))
|
|
+ """
|
|
+ >>> N['cat'] < N['dog']
|
|
+ 1
|
|
+ >>> N['dog'] < V['dog']
|
|
+ 1
|
|
+ """
|
|
+ return _compareInstances(self, other, ('pos', 'form'))
|
|
|
|
def __str__(self):
|
|
- """Return a human-readable representation.
|
|
-
|
|
- >>> str(N['dog'])
|
|
- 'dog(n.)'
|
|
- """
|
|
- abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}
|
|
- return self.form + "(" + abbrs[self.pos] + ")"
|
|
+ """Return a human-readable representation.
|
|
+
|
|
+ >>> str(N['dog'])
|
|
+ 'dog(n.)'
|
|
+ """
|
|
+ abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}
|
|
+ return self.form + "(" + abbrs[self.pos] + ")"
|
|
|
|
def __repr__(self):
|
|
- """If ReadableRepresentations is true, return a human-readable
|
|
- representation, e.g. 'dog(n.)'.
|
|
-
|
|
- If ReadableRepresentations is false, return a machine-readable
|
|
- representation, e.g. "getWord('dog', 'noun')".
|
|
- """
|
|
- if ReadableRepresentations:
|
|
- return str(self)
|
|
- return "getWord" + `(self.form, self.pos)`
|
|
-
|
|
+ """If ReadableRepresentations is true, return a human-readable
|
|
+ representation, e.g. 'dog(n.)'.
|
|
+
|
|
+ If ReadableRepresentations is false, return a machine-readable
|
|
+ representation, e.g. "getWord('dog', 'noun')".
|
|
+ """
|
|
+ if ReadableRepresentations:
|
|
+ return str(self)
|
|
+ return "getWord" + repr((self.form, self.pos))
|
|
+
|
|
#
|
|
# Sequence protocol (a Word's elements are its Senses)
|
|
#
|
|
- def __nonzero__(self):
|
|
- return 1
|
|
+ def __bool__(self):
|
|
+ return 1
|
|
|
|
def __len__(self):
|
|
- return len(self.getSenses())
|
|
+ return len(self.getSenses())
|
|
|
|
def __getitem__(self, index):
|
|
- return self.getSenses()[index]
|
|
+ return self.getSenses()[index]
|
|
|
|
def __getslice__(self, i, j):
|
|
- return self.getSenses()[i:j]
|
|
+ return self.getSenses()[i:j]
|
|
|
|
|
|
class Synset:
|
|
@@ -356,157 +355,157 @@
|
|
|
|
def __init__(self, pos, offset, line):
|
|
"Initialize the synset from a line off a WN synset file."
|
|
- self.pos = pos
|
|
+ self.pos = pos
|
|
"part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB."
|
|
- self.offset = offset
|
|
+ self.offset = offset
|
|
"""integer offset into the part-of-speech file. Together
|
|
with pos, this can be used as a unique id."""
|
|
- tokens = string.split(line[:string.index(line, '|')])
|
|
- self.ssType = tokens[2]
|
|
- self.gloss = string.strip(line[string.index(line, '|') + 1:])
|
|
+ tokens = line[:line.index('|')].split()
|
|
+ self.ssType = tokens[2]
|
|
+ self.gloss = line[line.index('|') + 1:].strip()
|
|
self.lexname = Lexname.lexnames[int(tokens[1])]
|
|
- (self._senseTuples, remainder) = _partition(tokens[4:], 2, string.atoi(tokens[3], 16))
|
|
- (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0]))
|
|
- if pos == VERB:
|
|
- (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0]))
|
|
- def extractVerbFrames(index, vfTuples):
|
|
- return tuple(map(lambda t:string.atoi(t[1]), filter(lambda t,i=index:string.atoi(t[2],16) in (0, i), vfTuples)))
|
|
- senseVerbFrames = []
|
|
- for index in range(1, len(self._senseTuples) + 1):
|
|
- senseVerbFrames.append(extractVerbFrames(index, vfTuples))
|
|
- self._senseVerbFrames = senseVerbFrames
|
|
- self.verbFrames = tuple(extractVerbFrames(None, vfTuples))
|
|
+ (self._senseTuples, remainder) = _partition(tokens[4:], 2, int(tokens[3], 16))
|
|
+ (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0]))
|
|
+ if pos == VERB:
|
|
+ (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0]))
|
|
+ def extractVerbFrames(index, vfTuples):
|
|
+ return tuple([int(t[1]) for t in list(filter(lambda t,i=index:int(t[2],16) in (0, i), vfTuples))])
|
|
+ senseVerbFrames = []
|
|
+ for index in range(1, len(self._senseTuples) + 1):
|
|
+ senseVerbFrames.append(extractVerbFrames(index, vfTuples))
|
|
+ self._senseVerbFrames = senseVerbFrames
|
|
+ self.verbFrames = tuple(extractVerbFrames(None, vfTuples))
|
|
"""A sequence of integers that index into
|
|
VERB_FRAME_STRINGS. These list the verb frames that any
|
|
Sense in this synset participates in. (See also
|
|
Sense.verbFrames.) Defined only for verbs."""
|
|
|
|
def getSenses(self):
|
|
- """Return a sequence of Senses.
|
|
-
|
|
- >>> N['dog'][0].getSenses()
|
|
- ('dog' in {noun: dog, domestic dog, Canis familiaris},)
|
|
- """
|
|
- if not hasattr(self, '_senses'):
|
|
- def loadSense(senseTuple, verbFrames=None, synset=self):
|
|
- return Sense(synset, senseTuple, verbFrames)
|
|
- if self.pos == VERB:
|
|
- self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames))
|
|
- del self._senseVerbFrames
|
|
- else:
|
|
- self._senses = tuple(map(loadSense, self._senseTuples))
|
|
- del self._senseTuples
|
|
- return self._senses
|
|
+ """Return a sequence of Senses.
|
|
+
|
|
+ >>> N['dog'][0].getSenses()
|
|
+ ('dog' in {noun: dog, domestic dog, Canis familiaris},)
|
|
+ """
|
|
+ if not hasattr(self, '_senses'):
|
|
+ def loadSense(senseTuple, verbFrames=None, synset=self):
|
|
+ return Sense(synset, senseTuple, verbFrames)
|
|
+ if self.pos == VERB:
|
|
+ self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames))
|
|
+ del self._senseVerbFrames
|
|
+ else:
|
|
+ self._senses = tuple(map(loadSense, self._senseTuples))
|
|
+ del self._senseTuples
|
|
+ return self._senses
|
|
|
|
senses = getSenses
|
|
|
|
def getPointers(self, pointerType=None):
|
|
- """Return a sequence of Pointers.
|
|
+ """Return a sequence of Pointers.
|
|
|
|
If pointerType is specified, only pointers of that type are
|
|
returned. In this case, pointerType should be an element of
|
|
POINTER_TYPES.
|
|
-
|
|
- >>> N['dog'][0].getPointers()[:5]
|
|
- (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
|
|
- >>> N['dog'][0].getPointers(HYPERNYM)
|
|
- (hypernym -> {noun: canine, canid},)
|
|
- """
|
|
- if not hasattr(self, '_pointers'):
|
|
- def loadPointer(tuple, synset=self):
|
|
- return Pointer(synset.offset, tuple)
|
|
- self._pointers = tuple(map(loadPointer, self._pointerTuples))
|
|
- del self._pointerTuples
|
|
- if pointerType == None:
|
|
- return self._pointers
|
|
- else:
|
|
- _requirePointerType(pointerType)
|
|
- return filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers)
|
|
+
|
|
+ >>> N['dog'][0].getPointers()[:5]
|
|
+ (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
|
|
+ >>> N['dog'][0].getPointers(HYPERNYM)
|
|
+ (hypernym -> {noun: canine, canid},)
|
|
+ """
|
|
+ if not hasattr(self, '_pointers'):
|
|
+ def loadPointer(tuple, synset=self):
|
|
+ return Pointer(synset.offset, tuple)
|
|
+ self._pointers = tuple(map(loadPointer, self._pointerTuples))
|
|
+ del self._pointerTuples
|
|
+ if pointerType == None:
|
|
+ return self._pointers
|
|
+ else:
|
|
+ _requirePointerType(pointerType)
|
|
+ return list(filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers))
|
|
|
|
pointers = getPointers # backwards compatability
|
|
|
|
def getPointerTargets(self, pointerType=None):
|
|
- """Return a sequence of Senses or Synsets.
|
|
-
|
|
+ """Return a sequence of Senses or Synsets.
|
|
+
|
|
If pointerType is specified, only targets of pointers of that
|
|
type are returned. In this case, pointerType should be an
|
|
element of POINTER_TYPES.
|
|
-
|
|
- >>> N['dog'][0].getPointerTargets()[:5]
|
|
- [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
|
|
- >>> N['dog'][0].getPointerTargets(HYPERNYM)
|
|
- [{noun: canine, canid}]
|
|
- """
|
|
- return map(Pointer.target, self.getPointers(pointerType))
|
|
+
|
|
+ >>> N['dog'][0].getPointerTargets()[:5]
|
|
+ [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
|
|
+ >>> N['dog'][0].getPointerTargets(HYPERNYM)
|
|
+ [{noun: canine, canid}]
|
|
+ """
|
|
+ return list(map(Pointer.target, self.getPointers(pointerType)))
|
|
|
|
pointerTargets = getPointerTargets # backwards compatability
|
|
|
|
def isTagged(self):
|
|
- """Return 1 if any sense is tagged.
|
|
-
|
|
- >>> N['dog'][0].isTagged()
|
|
- 1
|
|
- >>> N['dog'][1].isTagged()
|
|
- 0
|
|
- """
|
|
- return len(filter(Sense.isTagged, self.getSenses())) > 0
|
|
+ """Return 1 if any sense is tagged.
|
|
+
|
|
+ >>> N['dog'][0].isTagged()
|
|
+ 1
|
|
+ >>> N['dog'][1].isTagged()
|
|
+ 0
|
|
+ """
|
|
+ return len(list(filter(Sense.isTagged, self.getSenses()))) > 0
|
|
|
|
def __str__(self):
|
|
- """Return a human-readable representation.
|
|
-
|
|
- >>> str(N['dog'][0].synset)
|
|
- '{noun: dog, domestic dog, Canis familiaris}'
|
|
- """
|
|
- return "{" + self.pos + ": " + string.joinfields(map(lambda sense:sense.form, self.getSenses()), ", ") + "}"
|
|
+ """Return a human-readable representation.
|
|
+
|
|
+ >>> str(N['dog'][0].synset)
|
|
+ '{noun: dog, domestic dog, Canis familiaris}'
|
|
+ """
|
|
+ return "{" + self.pos + ": " + string.joinfields([sense.form for sense in self.getSenses()], ", ") + "}"
|
|
|
|
def __repr__(self):
|
|
- """If ReadableRepresentations is true, return a human-readable
|
|
- representation, e.g. 'dog(n.)'.
|
|
-
|
|
- If ReadableRepresentations is false, return a machine-readable
|
|
- representation, e.g. "getSynset(pos, 1234)".
|
|
- """
|
|
- if ReadableRepresentations:
|
|
- return str(self)
|
|
- return "getSynset" + `(self.pos, self.offset)`
|
|
+ """If ReadableRepresentations is true, return a human-readable
|
|
+ representation, e.g. 'dog(n.)'.
|
|
+
|
|
+ If ReadableRepresentations is false, return a machine-readable
|
|
+ representation, e.g. "getSynset(pos, 1234)".
|
|
+ """
|
|
+ if ReadableRepresentations:
|
|
+ return str(self)
|
|
+ return "getSynset" + repr((self.pos, self.offset))
|
|
|
|
def __cmp__(self, other):
|
|
- return _compareInstances(self, other, ('pos', 'offset'))
|
|
+ return _compareInstances(self, other, ('pos', 'offset'))
|
|
|
|
#
|
|
# Sequence protocol (a Synset's elements are its senses).
|
|
#
|
|
- def __nonzero__(self):
|
|
- return 1
|
|
+ def __bool__(self):
|
|
+ return 1
|
|
|
|
def __len__(self):
|
|
- """
|
|
- >>> len(N['dog'][0].synset)
|
|
- 3
|
|
- """
|
|
- return len(self.getSenses())
|
|
+ """
|
|
+ >>> len(N['dog'][0].synset)
|
|
+ 3
|
|
+ """
|
|
+ return len(self.getSenses())
|
|
|
|
def __getitem__(self, idx):
|
|
- """
|
|
- >>> N['dog'][0].synset[0] == N['dog'][0]
|
|
- 1
|
|
- >>> N['dog'][0].synset['dog'] == N['dog'][0]
|
|
- 1
|
|
- >>> N['dog'][0].synset[N['dog']] == N['dog'][0]
|
|
- 1
|
|
- >>> N['cat'][6]
|
|
- 'cat' in {noun: big cat, cat}
|
|
- """
|
|
- senses = self.getSenses()
|
|
- if isinstance(idx, Word):
|
|
- idx = idx.form
|
|
- if isinstance(idx, StringType):
|
|
- idx = _index(idx, map(lambda sense:sense.form, senses)) or \
|
|
- _index(idx, map(lambda sense:sense.form, senses), _equalsIgnoreCase)
|
|
- return senses[idx]
|
|
+ """
|
|
+ >>> N['dog'][0].synset[0] == N['dog'][0]
|
|
+ 1
|
|
+ >>> N['dog'][0].synset['dog'] == N['dog'][0]
|
|
+ 1
|
|
+ >>> N['dog'][0].synset[N['dog']] == N['dog'][0]
|
|
+ 1
|
|
+ >>> N['cat'][6]
|
|
+ 'cat' in {noun: big cat, cat}
|
|
+ """
|
|
+ senses = self.getSenses()
|
|
+ if isinstance(idx, Word):
|
|
+ idx = idx.form
|
|
+ if isinstance(idx, str):
|
|
+ idx = _index(idx, [sense.form for sense in senses]) or \
|
|
+ _index(idx, [sense.form for sense in senses], _equalsIgnoreCase)
|
|
+ return senses[idx]
|
|
|
|
def __getslice__(self, i, j):
|
|
- return self.getSenses()[i:j]
|
|
+ return self.getSenses()[i:j]
|
|
|
|
|
|
class Sense:
|
|
@@ -527,7 +526,7 @@
|
|
VERB_FRAME_STRINGS. These list the verb frames that this
|
|
Sense partipates in. Defined only for verbs.
|
|
|
|
- >>> decide = V['decide'][0].synset # first synset for 'decide'
|
|
+ >>> decide = V['decide'][0].synset # first synset for 'decide'
|
|
>>> decide[0].verbFrames
|
|
(8, 2, 26, 29)
|
|
>>> decide[1].verbFrames
|
|
@@ -538,124 +537,124 @@
|
|
|
|
def __init__(sense, synset, senseTuple, verbFrames=None):
|
|
"Initialize a sense from a synset's senseTuple."
|
|
- # synset is stored by key (pos, synset) rather than object
|
|
- # reference, to avoid creating a circular reference between
|
|
- # Senses and Synsets that will prevent the vm from
|
|
- # garbage-collecting them.
|
|
- sense.pos = synset.pos
|
|
+ # synset is stored by key (pos, synset) rather than object
|
|
+ # reference, to avoid creating a circular reference between
|
|
+ # Senses and Synsets that will prevent the vm from
|
|
+ # garbage-collecting them.
|
|
+ sense.pos = synset.pos
|
|
"part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"
|
|
- sense.synsetOffset = synset.offset
|
|
+ sense.synsetOffset = synset.offset
|
|
"synset key. This is used to retrieve the sense."
|
|
- sense.verbFrames = verbFrames
|
|
+ sense.verbFrames = verbFrames
|
|
"""A sequence of integers that index into
|
|
VERB_FRAME_STRINGS. These list the verb frames that this
|
|
Sense partipates in. Defined only for verbs."""
|
|
- (form, idString) = senseTuple
|
|
- sense.position = None
|
|
- if '(' in form:
|
|
- index = string.index(form, '(')
|
|
- key = form[index + 1:-1]
|
|
- form = form[:index]
|
|
- if key == 'a':
|
|
- sense.position = ATTRIBUTIVE
|
|
- elif key == 'p':
|
|
- sense.position = PREDICATIVE
|
|
- elif key == 'ip':
|
|
- sense.position = IMMEDIATE_POSTNOMINAL
|
|
- else:
|
|
- raise "unknown attribute " + key
|
|
- sense.form = string.replace(form, '_', ' ')
|
|
+ (form, idString) = senseTuple
|
|
+ sense.position = None
|
|
+ if '(' in form:
|
|
+ index = form.index('(')
|
|
+ key = form[index + 1:-1]
|
|
+ form = form[:index]
|
|
+ if key == 'a':
|
|
+ sense.position = ATTRIBUTIVE
|
|
+ elif key == 'p':
|
|
+ sense.position = PREDICATIVE
|
|
+ elif key == 'ip':
|
|
+ sense.position = IMMEDIATE_POSTNOMINAL
|
|
+ else:
|
|
+ raise "unknown attribute " + key
|
|
+ sense.form = form.replace('_', ' ')
|
|
"orthographic representation of the Word this is a Sense of."
|
|
|
|
def __getattr__(self, name):
|
|
- # see the note at __init__ about why 'synset' is provided as a
|
|
- # 'virtual' slot
|
|
- if name == 'synset':
|
|
- return getSynset(self.pos, self.synsetOffset)
|
|
+ # see the note at __init__ about why 'synset' is provided as a
|
|
+ # 'virtual' slot
|
|
+ if name == 'synset':
|
|
+ return getSynset(self.pos, self.synsetOffset)
|
|
elif name == 'lexname':
|
|
return self.synset.lexname
|
|
- else:
|
|
- raise AttributeError, name
|
|
+ else:
|
|
+ raise AttributeError(name)
|
|
|
|
def __str__(self):
|
|
- """Return a human-readable representation.
|
|
-
|
|
- >>> str(N['dog'])
|
|
- 'dog(n.)'
|
|
- """
|
|
- return `self.form` + " in " + str(self.synset)
|
|
+ """Return a human-readable representation.
|
|
+
|
|
+ >>> str(N['dog'])
|
|
+ 'dog(n.)'
|
|
+ """
|
|
+ return repr(self.form) + " in " + str(self.synset)
|
|
|
|
def __repr__(self):
|
|
- """If ReadableRepresentations is true, return a human-readable
|
|
- representation, e.g. 'dog(n.)'.
|
|
-
|
|
- If ReadableRepresentations is false, return a machine-readable
|
|
- representation, e.g. "getWord('dog', 'noun')".
|
|
- """
|
|
- if ReadableRepresentations:
|
|
- return str(self)
|
|
- return "%s[%s]" % (`self.synset`, `self.form`)
|
|
+ """If ReadableRepresentations is true, return a human-readable
|
|
+ representation, e.g. 'dog(n.)'.
|
|
+
|
|
+ If ReadableRepresentations is false, return a machine-readable
|
|
+ representation, e.g. "getWord('dog', 'noun')".
|
|
+ """
|
|
+ if ReadableRepresentations:
|
|
+ return str(self)
|
|
+ return "%s[%s]" % (repr(self.synset), repr(self.form))
|
|
|
|
def getPointers(self, pointerType=None):
|
|
- """Return a sequence of Pointers.
|
|
-
|
|
+ """Return a sequence of Pointers.
|
|
+
|
|
If pointerType is specified, only pointers of that type are
|
|
returned. In this case, pointerType should be an element of
|
|
POINTER_TYPES.
|
|
-
|
|
- >>> N['dog'][0].getPointers()[:5]
|
|
- (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
|
|
- >>> N['dog'][0].getPointers(HYPERNYM)
|
|
- (hypernym -> {noun: canine, canid},)
|
|
- """
|
|
- senseIndex = _index(self, self.synset.getSenses())
|
|
- def pointsFromThisSense(pointer, selfIndex=senseIndex):
|
|
- return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex
|
|
- return filter(pointsFromThisSense, self.synset.getPointers(pointerType))
|
|
+
|
|
+ >>> N['dog'][0].getPointers()[:5]
|
|
+ (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
|
|
+ >>> N['dog'][0].getPointers(HYPERNYM)
|
|
+ (hypernym -> {noun: canine, canid},)
|
|
+ """
|
|
+ senseIndex = _index(self, self.synset.getSenses())
|
|
+ def pointsFromThisSense(pointer, selfIndex=senseIndex):
|
|
+ return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex
|
|
+ return list(filter(pointsFromThisSense, self.synset.getPointers(pointerType)))
|
|
|
|
pointers = getPointers # backwards compatability
|
|
|
|
def getPointerTargets(self, pointerType=None):
|
|
- """Return a sequence of Senses or Synsets.
|
|
-
|
|
+ """Return a sequence of Senses or Synsets.
|
|
+
|
|
If pointerType is specified, only targets of pointers of that
|
|
type are returned. In this case, pointerType should be an
|
|
element of POINTER_TYPES.
|
|
-
|
|
- >>> N['dog'][0].getPointerTargets()[:5]
|
|
- [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
|
|
- >>> N['dog'][0].getPointerTargets(HYPERNYM)
|
|
- [{noun: canine, canid}]
|
|
- """
|
|
- return map(Pointer.target, self.getPointers(pointerType))
|
|
+
|
|
+ >>> N['dog'][0].getPointerTargets()[:5]
|
|
+ [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
|
|
+ >>> N['dog'][0].getPointerTargets(HYPERNYM)
|
|
+ [{noun: canine, canid}]
|
|
+ """
|
|
+ return list(map(Pointer.target, self.getPointers(pointerType)))
|
|
|
|
pointerTargets = getPointerTargets # backwards compatability
|
|
|
|
def getSenses(self):
|
|
- return self,
|
|
+ return self,
|
|
|
|
senses = getSenses # backwards compatability
|
|
|
|
def isTagged(self):
|
|
- """Return 1 if any sense is tagged.
|
|
-
|
|
- >>> N['dog'][0].isTagged()
|
|
- 1
|
|
- >>> N['dog'][1].isTagged()
|
|
- 0
|
|
- """
|
|
- word = self.word()
|
|
- return _index(self, word.getSenses()) < word.taggedSenseCount
|
|
+ """Return 1 if any sense is tagged.
|
|
+
|
|
+ >>> N['dog'][0].isTagged()
|
|
+ 1
|
|
+ >>> N['dog'][1].isTagged()
|
|
+ 0
|
|
+ """
|
|
+ word = self.word()
|
|
+ return _index(self, word.getSenses()) < word.taggedSenseCount
|
|
|
|
def getWord(self):
|
|
- return getWord(self.form, self.pos)
|
|
+ return getWord(self.form, self.pos)
|
|
|
|
word = getWord # backwards compatability
|
|
|
|
def __cmp__(self, other):
|
|
- def senseIndex(sense, synset=self.synset):
|
|
- return _index(sense, synset.getSenses(), testfn=lambda a,b: a.form == b.form)
|
|
- return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other))
|
|
+ def senseIndex(sense, synset=self.synset):
|
|
+ return _index(sense, synset.getSenses(), testfn=lambda a,b: a.form == b.form)
|
|
+ return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other))
|
|
|
|
|
|
class Pointer:
|
|
@@ -670,21 +669,21 @@
|
|
"""
|
|
|
|
_POINTER_TYPE_TABLE = {
|
|
- '!': ANTONYM,
|
|
+ '!': ANTONYM,
|
|
'@': HYPERNYM,
|
|
'~': HYPONYM,
|
|
- '=': ATTRIBUTE,
|
|
+ '=': ATTRIBUTE,
|
|
'^': ALSO_SEE,
|
|
'*': ENTAILMENT,
|
|
'>': CAUSE,
|
|
- '$': VERB_GROUP,
|
|
- '#m': MEMBER_MERONYM,
|
|
+ '$': VERB_GROUP,
|
|
+ '#m': MEMBER_MERONYM,
|
|
'#s': SUBSTANCE_MERONYM,
|
|
'#p': PART_MERONYM,
|
|
- '%m': MEMBER_HOLONYM,
|
|
+ '%m': MEMBER_HOLONYM,
|
|
'%s': SUBSTANCE_HOLONYM,
|
|
'%p': PART_HOLONYM,
|
|
- '&': SIMILAR,
|
|
+ '&': SIMILAR,
|
|
'<': PARTICIPLE_OF,
|
|
'\\': PERTAINYM,
|
|
# New in wn 2.0:
|
|
@@ -698,57 +697,57 @@
|
|
}
|
|
|
|
def __init__(self, sourceOffset, pointerTuple):
|
|
- (type, offset, pos, indices) = pointerTuple
|
|
- # dnaber: try to adapt to WordNet 2.1:
|
|
- if type == "@i":
|
|
- type = "@"
|
|
- if type == "~i":
|
|
- type = "~"
|
|
- # /dnaber
|
|
- self.type = Pointer._POINTER_TYPE_TABLE[type]
|
|
+ (type, offset, pos, indices) = pointerTuple
|
|
+ # dnaber: try to adapt to WordNet 2.1:
|
|
+ if type == "@i":
|
|
+ type = "@"
|
|
+ if type == "~i":
|
|
+ type = "~"
|
|
+ # /dnaber
|
|
+ self.type = Pointer._POINTER_TYPE_TABLE[type]
|
|
"""One of POINTER_TYPES."""
|
|
- self.sourceOffset = sourceOffset
|
|
- self.targetOffset = int(offset)
|
|
- self.pos = _normalizePOS(pos)
|
|
+ self.sourceOffset = sourceOffset
|
|
+ self.targetOffset = int(offset)
|
|
+ self.pos = _normalizePOS(pos)
|
|
"""part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
|
|
- indices = string.atoi(indices, 16)
|
|
- self.sourceIndex = indices >> 8
|
|
- self.targetIndex = indices & 255
|
|
+ indices = int(indices, 16)
|
|
+ self.sourceIndex = indices >> 8
|
|
+ self.targetIndex = indices & 255
|
|
|
|
def getSource(self):
|
|
- synset = getSynset(self.pos, self.sourceOffset)
|
|
- if self.sourceIndex:
|
|
- return synset[self.sourceIndex - 1]
|
|
- else:
|
|
- return synset
|
|
+ synset = getSynset(self.pos, self.sourceOffset)
|
|
+ if self.sourceIndex:
|
|
+ return synset[self.sourceIndex - 1]
|
|
+ else:
|
|
+ return synset
|
|
|
|
source = getSource # backwards compatability
|
|
|
|
def getTarget(self):
|
|
- synset = getSynset(self.pos, self.targetOffset)
|
|
- if self.targetIndex:
|
|
- return synset[self.targetIndex - 1]
|
|
- else:
|
|
- return synset
|
|
+ synset = getSynset(self.pos, self.targetOffset)
|
|
+ if self.targetIndex:
|
|
+ return synset[self.targetIndex - 1]
|
|
+ else:
|
|
+ return synset
|
|
|
|
target = getTarget # backwards compatability
|
|
|
|
def __str__(self):
|
|
- return self.type + " -> " + str(self.target())
|
|
+ return self.type + " -> " + str(self.target())
|
|
|
|
def __repr__(self):
|
|
- if ReadableRepresentations:
|
|
- return str(self)
|
|
- return "<" + str(self) + ">"
|
|
+ if ReadableRepresentations:
|
|
+ return str(self)
|
|
+ return "<" + str(self) + ">"
|
|
|
|
def __cmp__(self, other):
|
|
- diff = _compareInstances(self, other, ('pos', 'sourceOffset'))
|
|
- if diff:
|
|
- return diff
|
|
- synset = self.source()
|
|
- def pointerIndex(sense, synset=synset):
|
|
- return _index(sense, synset.getPointers(), testfn=lambda a,b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex')))
|
|
- return cmp(pointerIndex(self), pointerIndex(other))
|
|
+ diff = _compareInstances(self, other, ('pos', 'sourceOffset'))
|
|
+ if diff:
|
|
+ return diff
|
|
+ synset = self.source()
|
|
+ def pointerIndex(sense, synset=synset):
|
|
+ return _index(sense, synset.getPointers(), testfn=lambda a,b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex')))
|
|
+ return cmp(pointerIndex(self), pointerIndex(other))
|
|
|
|
|
|
# Loading the lexnames
|
|
@@ -769,7 +768,7 @@
|
|
|
|
def setupLexnames():
|
|
for l in open(WNSEARCHDIR+'/lexnames').readlines():
|
|
- i,name,category = string.split(l)
|
|
+ i,name,category = l.split()
|
|
Lexname(name,PartsOfSpeech[int(category)-1])
|
|
|
|
setupLexnames()
|
|
@@ -802,59 +801,59 @@
|
|
"""
|
|
|
|
def __init__(self, pos, filenameroot):
|
|
- self.pos = pos
|
|
+ self.pos = pos
|
|
"""part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
|
|
- self.indexFile = _IndexFile(pos, filenameroot)
|
|
- self.dataFile = open(_dataFilePathname(filenameroot), _FILE_OPEN_MODE)
|
|
+ self.indexFile = _IndexFile(pos, filenameroot)
|
|
+ self.dataFile = open(_dataFilePathname(filenameroot), _FILE_OPEN_MODE)
|
|
|
|
def __repr__(self):
|
|
- dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'}
|
|
- if dictionaryVariables.get(self):
|
|
- return self.__module__ + "." + dictionaryVariables[self]
|
|
- return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos)
|
|
+ dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'}
|
|
+ if dictionaryVariables.get(self):
|
|
+ return self.__module__ + "." + dictionaryVariables[self]
|
|
+ return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos)
|
|
|
|
def getWord(self, form, line=None):
|
|
- key = string.replace(string.lower(form), ' ', '_')
|
|
- pos = self.pos
|
|
- def loader(key=key, line=line, indexFile=self.indexFile):
|
|
- line = line or indexFile.get(key)
|
|
- return line and Word(line)
|
|
- word = _entityCache.get((pos, key), loader)
|
|
- if word:
|
|
- return word
|
|
- else:
|
|
- raise KeyError, "%s is not in the %s database" % (`form`, `pos`)
|
|
+ key = form.lower().replace(' ', '_')
|
|
+ pos = self.pos
|
|
+ def loader(key=key, line=line, indexFile=self.indexFile):
|
|
+ line = line or indexFile.get(key)
|
|
+ return line and Word(line)
|
|
+ word = _entityCache.get((pos, key), loader)
|
|
+ if word != None:
|
|
+ return word
|
|
+ else:
|
|
+ raise KeyError("%s is not in the %s database" % (repr(form), repr(pos)))
|
|
|
|
def getSynset(self, offset):
|
|
- pos = self.pos
|
|
- def loader(pos=pos, offset=offset, dataFile=self.dataFile):
|
|
- return Synset(pos, offset, _lineAt(dataFile, offset))
|
|
- return _entityCache.get((pos, offset), loader)
|
|
+ pos = self.pos
|
|
+ def loader(pos=pos, offset=offset, dataFile=self.dataFile):
|
|
+ return Synset(pos, offset, _lineAt(dataFile, offset))
|
|
+ return _entityCache.get((pos, offset), loader)
|
|
|
|
def _buildIndexCacheFile(self):
|
|
- self.indexFile._buildIndexCacheFile()
|
|
+ self.indexFile._buildIndexCacheFile()
|
|
|
|
#
|
|
# Sequence protocol (a Dictionary's items are its Words)
|
|
#
|
|
- def __nonzero__(self):
|
|
- """Return false. (This is to avoid scanning the whole index file
|
|
- to compute len when a Dictionary is used in test position.)
|
|
-
|
|
- >>> N and 'true'
|
|
- 'true'
|
|
- """
|
|
- return 1
|
|
+ def __bool__(self):
|

+ """Return true. (This is to avoid scanning the whole index file
|
|
+ to compute len when a Dictionary is used in test position.)
|
|
+
|
|
+ >>> N and 'true'
|
|
+ 'true'
|
|
+ """
|
|
+ return True
|
|
|
|
def __len__(self):
|
|
- """Return the number of index entries.
|
|
-
|
|
- >>> len(ADJ)
|
|
- 21435
|
|
- """
|
|
- if not hasattr(self, 'length'):
|
|
- self.length = len(self.indexFile)
|
|
- return self.length
|
|
+ """Return the number of index entries.
|
|
+
|
|
+ >>> len(ADJ)
|
|
+ 21435
|
|
+ """
|
|
+ if not hasattr(self, 'length'):
|
|
+ self.length = len(self.indexFile)
|
|
+ return self.length
|
|
|
|
def __getslice__(self, a, b):
|
|
results = []
|
|
@@ -868,22 +867,22 @@
|
|
return results
|
|
|
|
def __getitem__(self, index):
|
|
- """If index is a String, return the Word whose form is
|
|
- index. If index is an integer n, return the Word
|
|
- indexed by the n'th Word in the Index file.
|
|
-
|
|
- >>> N['dog']
|
|
- dog(n.)
|
|
- >>> N[0]
|
|
- 'hood(n.)
|
|
- """
|
|
- if isinstance(index, StringType):
|
|
- return self.getWord(index)
|
|
- elif isinstance(index, IntType):
|
|
- line = self.indexFile[index]
|
|
- return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line)
|
|
- else:
|
|
- raise TypeError, "%s is not a String or Int" % `index`
|
|
+ """If index is a String, return the Word whose form is
|
|
+ index. If index is an integer n, return the Word
|
|
+ indexed by the n'th Word in the Index file.
|
|
+
|
|
+ >>> N['dog']
|
|
+ dog(n.)
|
|
+ >>> N[0]
|
|
+ 'hood(n.)
|
|
+ """
|
|
+ if isinstance(index, str):
|
|
+ return self.getWord(index)
|
|
+ elif isinstance(index, int):
|
|
+ line = self.indexFile[index]
|
|
+ return self.getWord(line[:line.find(' ')].replace('_', ' '), line)
|
|
+ else:
|
|
+ raise TypeError("%s is not a String or Int" % repr(index))
|
|
|
|
#
|
|
# Dictionary protocol
|
|
@@ -892,54 +891,54 @@
|
|
#
|
|
|
|
def get(self, key, default=None):
|
|
- """Return the Word whose form is _key_, or _default_.
|
|
-
|
|
- >>> N.get('dog')
|
|
- dog(n.)
|
|
- >>> N.get('inu')
|
|
- """
|
|
- try:
|
|
- return self[key]
|
|
- except LookupError:
|
|
- return default
|
|
+ """Return the Word whose form is _key_, or _default_.
|
|
+
|
|
+ >>> N.get('dog')
|
|
+ dog(n.)
|
|
+ >>> N.get('inu')
|
|
+ """
|
|
+ try:
|
|
+ return self[key]
|
|
+ except LookupError:
|
|
+ return default
|
|
|
|
def keys(self):
|
|
- """Return a sorted list of strings that index words in this
|
|
- dictionary."""
|
|
- return self.indexFile.keys()
|
|
+ """Return a sorted list of strings that index words in this
|
|
+ dictionary."""
|
|
+ return list(self.indexFile.keys())
|
|
|
|
def has_key(self, form):
|
|
- """Return true iff the argument indexes a word in this dictionary.
|
|
-
|
|
- >>> N.has_key('dog')
|
|
- 1
|
|
- >>> N.has_key('inu')
|
|
- 0
|
|
- """
|
|
- return self.indexFile.has_key(form)
|
|
+ """Return true iff the argument indexes a word in this dictionary.
|
|
+
|
|
+ >>> N.has_key('dog')
|
|
+ 1
|
|
+ >>> N.has_key('inu')
|
|
+ 0
|
|
+ """
|
|
+ return form in self.indexFile
|
|
|
|
#
|
|
# Testing
|
|
#
|
|
|
|
def _testKeys(self):
|
|
- """Verify that index lookup can find each word in the index file."""
|
|
- print "Testing: ", self
|
|
- file = open(self.indexFile.file.name, _FILE_OPEN_MODE)
|
|
- counter = 0
|
|
- while 1:
|
|
- line = file.readline()
|
|
- if line == '': break
|
|
- if line[0] != ' ':
|
|
- key = string.replace(line[:string.find(line, ' ')], '_', ' ')
|
|
- if (counter % 1000) == 0:
|
|
- print "%s..." % (key,),
|
|
- import sys
|
|
- sys.stdout.flush()
|
|
- counter = counter + 1
|
|
- self[key]
|
|
- file.close()
|
|
- print "done."
|
|
+ """Verify that index lookup can find each word in the index file."""
|
|
+ print("Testing: ", self)
|
|
+ file = open(self.indexFile.file.name, _FILE_OPEN_MODE)
|
|
+ counter = 0
|
|
+ while 1:
|
|
+ line = file.readline()
|
|
+ if line == '': break
|
|
+ if line[0] != ' ':
|
|
+ key = line[:line.find(' ')].replace('_', ' ')
|
|
+ if (counter % 1000) == 0:
|
|
+ print("%s..." % (key,), end=' ')
|
|
+ import sys
|
|
+ sys.stdout.flush()
|
|
+ counter = counter + 1
|
|
+ self[key]
|
|
+ file.close()
|
|
+ print("done.")
|
|
|
|
|
|
class _IndexFile:
|
|
@@ -947,69 +946,69 @@
|
|
Sequence and Dictionary interface to a sorted index file."""
|
|
|
|
def __init__(self, pos, filenameroot):
|
|
- self.pos = pos
|
|
- self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE)
|
|
- self.offsetLineCache = {} # Table of (pathname, offset) -> (line, nextOffset)
|
|
- self.rewind()
|
|
- self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx")
|
|
- try:
|
|
- import shelve
|
|
- self.indexCache = shelve.open(self.shelfname, 'r')
|
|
- except:
|
|
- pass
|
|
+ self.pos = pos
|
|
+ self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE)
|
|
+ self.offsetLineCache = {} # Table of (pathname, offset) -> (line, nextOffset)
|
|
+ self.rewind()
|
|
+ self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx")
|
|
+ try:
|
|
+ import shelve
|
|
+ self.indexCache = shelve.open(self.shelfname, 'r')
|
|
+ except:
|
|
+ pass
|
|
|
|
def rewind(self):
|
|
- self.file.seek(0)
|
|
- while 1:
|
|
- offset = self.file.tell()
|
|
- line = self.file.readline()
|
|
- if (line[0] != ' '):
|
|
- break
|
|
- self.nextIndex = 0
|
|
- self.nextOffset = offset
|
|
+ self.file.seek(0)
|
|
+ while 1:
|
|
+ offset = self.file.tell()
|
|
+ line = self.file.readline()
|
|
+ if (line[0] != ' '):
|
|
+ break
|
|
+ self.nextIndex = 0
|
|
+ self.nextOffset = offset
|
|
|
|
#
|
|
# Sequence protocol (an _IndexFile's items are its lines)
|
|
#
|
|
- def __nonzero__(self):
|
|
- return 1
|
|
+ def __bool__(self):
|
|
+ return True
|
|
|
|
def __len__(self):
|
|
- if hasattr(self, 'indexCache'):
|
|
- return len(self.indexCache)
|
|
- self.rewind()
|
|
- lines = 0
|
|
- while 1:
|
|
- line = self.file.readline()
|
|
- if line == "":
|
|
- break
|
|
- lines = lines + 1
|
|
- return lines
|
|
+ if hasattr(self, 'indexCache'):
|
|
+ return len(self.indexCache)
|
|
+ self.rewind()
|
|
+ lines = 0
|
|
+ while 1:
|
|
+ line = self.file.readline()
|
|
+ if line == "":
|
|
+ break
|
|
+ lines = lines + 1
|
|
+ return lines
|
|
|
|
- def __nonzero__(self):
|
|
- return 1
|
|
+ def __bool__(self):
|
|
+ return True
|
|
|
|
def __getitem__(self, index):
|
|
- if isinstance(index, StringType):
|
|
- if hasattr(self, 'indexCache'):
|
|
- return self.indexCache[index]
|
|
- return binarySearchFile(self.file, index, self.offsetLineCache, 8)
|
|
- elif isinstance(index, IntType):
|
|
- if hasattr(self, 'indexCache'):
|
|
- return self.get(self.keys[index])
|
|
- if index < self.nextIndex:
|
|
- self.rewind()
|
|
- while self.nextIndex <= index:
|
|
- self.file.seek(self.nextOffset)
|
|
- line = self.file.readline()
|
|
- if line == "":
|
|
- raise IndexError, "index out of range"
|
|
- self.nextIndex = self.nextIndex + 1
|
|
- self.nextOffset = self.file.tell()
|
|
- return line
|
|
- else:
|
|
- raise TypeError, "%s is not a String or Int" % `index`
|
|
-
|
|
+ if isinstance(index, str):
|
|
+ if hasattr(self, 'indexCache'):
|
|
+ return self.indexCache[index]
|
|
+ return binarySearchFile(self.file, index, self.offsetLineCache, 8)
|
|
+ elif isinstance(index, int):
|
|
+ if hasattr(self, 'indexCache'):
|
|
+ return self.get(self.keys()[index])
|
|
+ if index < self.nextIndex:
|
|
+ self.rewind()
|
|
+ while self.nextIndex <= index:
|
|
+ self.file.seek(self.nextOffset)
|
|
+ line = self.file.readline()
|
|
+ if line == "":
|
|
+ raise IndexError("index out of range")
|
|
+ self.nextIndex = self.nextIndex + 1
|
|
+ self.nextOffset = self.file.tell()
|
|
+ return line
|
|
+ else:
|
|
+ raise TypeError("%s is not a String or Int" % repr(index))
|
|
+
|
|
#
|
|
# Dictionary protocol
|
|
#
|
|
@@ -1017,62 +1016,62 @@
|
|
#
|
|
|
|
def get(self, key, default=None):
|
|
- try:
|
|
- return self[key]
|
|
- except LookupError:
|
|
- return default
|
|
+ try:
|
|
+ return self[key]
|
|
+ except LookupError:
|
|
+ return default
|
|
|
|
def keys(self):
|
|
- if hasattr(self, 'indexCache'):
|
|
- keys = self.indexCache.keys()
|
|
- keys.sort()
|
|
- return keys
|
|
- else:
|
|
- keys = []
|
|
- self.rewind()
|
|
- while 1:
|
|
- line = self.file.readline()
|
|
- if not line: break
|
|
+ if hasattr(self, 'indexCache'):
|
|
+ keys = list(self.indexCache.keys())
|
|
+ keys.sort()
|
|
+ return keys
|
|
+ else:
|
|
+ keys = []
|
|
+ self.rewind()
|
|
+ while 1:
|
|
+ line = self.file.readline()
|
|
+ if not line: break
|
|
key = line.split(' ', 1)[0]
|
|
- keys.append(key.replace('_', ' '))
|
|
- return keys
|
|
+ keys.append(key.replace('_', ' '))
|
|
+ return keys
|
|
|
|
def has_key(self, key):
|
|
- key = key.replace(' ', '_') # test case: V['haze over']
|
|
- if hasattr(self, 'indexCache'):
|
|
- return self.indexCache.has_key(key)
|
|
- return self.get(key) != None
|
|
+ key = key.replace(' ', '_') # test case: V['haze over']
|
|
+ if hasattr(self, 'indexCache'):
|
|
+ return key in self.indexCache
|
|
+ return self.get(key) != None
|
|
|
|
#
|
|
# Index file
|
|
#
|
|
|
|
def _buildIndexCacheFile(self):
|
|
- import shelve
|
|
- import os
|
|
- print "Building %s:" % (self.shelfname,),
|
|
- tempname = self.shelfname + ".temp"
|
|
- try:
|
|
- indexCache = shelve.open(tempname)
|
|
- self.rewind()
|
|
- count = 0
|
|
- while 1:
|
|
- offset, line = self.file.tell(), self.file.readline()
|
|
- if not line: break
|
|
- key = line[:string.find(line, ' ')]
|
|
- if (count % 1000) == 0:
|
|
- print "%s..." % (key,),
|
|
- import sys
|
|
- sys.stdout.flush()
|
|
- indexCache[key] = line
|
|
- count = count + 1
|
|
- indexCache.close()
|
|
- os.rename(tempname, self.shelfname)
|
|
- finally:
|
|
- try: os.remove(tempname)
|
|
- except: pass
|
|
- print "done."
|
|
- self.indexCache = shelve.open(self.shelfname, 'r')
|
|
+ import shelve
|
|
+ import os
|
|
+ print("Building %s:" % (self.shelfname,), end=' ')
|
|
+ tempname = self.shelfname + ".temp"
|
|
+ try:
|
|
+ indexCache = shelve.open(tempname)
|
|
+ self.rewind()
|
|
+ count = 0
|
|
+ while 1:
|
|
+ offset, line = self.file.tell(), self.file.readline()
|
|
+ if not line: break
|
|
+ key = line[:line.find(' ')]
|
|
+ if (count % 1000) == 0:
|
|
+ print("%s..." % (key,), end=' ')
|
|
+ import sys
|
|
+ sys.stdout.flush()
|
|
+ indexCache[key] = line
|
|
+ count = count + 1
|
|
+ indexCache.close()
|
|
+ os.rename(tempname, self.shelfname)
|
|
+ finally:
|
|
+ try: os.remove(tempname)
|
|
+ except: pass
|
|
+ print("done.")
|
|
+ self.indexCache = shelve.open(self.shelfname, 'r')
|
|
|
|
|
|
#
|
|
@@ -1099,20 +1098,20 @@
|
|
|
|
def _requirePointerType(pointerType):
|
|
if pointerType not in POINTER_TYPES:
|
|
- raise TypeError, `pointerType` + " is not a pointer type"
|
|
+ raise TypeError(repr(pointerType) + " is not a pointer type")
|
|
return pointerType
|
|
|
|
def _compareInstances(a, b, fields):
|
|
""""Return -1, 0, or 1 according to a comparison first by type,
|
|
then by class, and finally by each of fields.""" # " <- for emacs
|
|
if not hasattr(b, '__class__'):
|
|
- return cmp(type(a), type(b))
|
|
+ return cmp(type(a), type(b))
|
|
elif a.__class__ != b.__class__:
|
|
- return cmp(a.__class__, b.__class__)
|
|
+ return cmp(a.__class__, b.__class__)
|
|
for field in fields:
|
|
- diff = cmp(getattr(a, field), getattr(b, field))
|
|
- if diff:
|
|
- return diff
|
|
+ diff = cmp(getattr(a, field), getattr(b, field))
|
|
+ if diff:
|
|
+ return diff
|
|
return 0
|
|
|
|
def _equalsIgnoreCase(a, b):
|
|
@@ -1123,21 +1122,21 @@
|
|
>>> _equalsIgnoreCase('dOg', 'DOG')
|
|
1
|
|
"""
|
|
- return a == b or string.lower(a) == string.lower(b)
|
|
+ return a == b or a.lower() == b.lower()
|
|
|
|
#
|
|
# File utilities
|
|
#
|
|
def _dataFilePathname(filenameroot):
|
|
if os.name in ('dos', 'nt'):
|
|
- path = os.path.join(WNSEARCHDIR, filenameroot + ".dat")
|
|
+ path = os.path.join(WNSEARCHDIR, filenameroot + ".dat")
|
|
if os.path.exists(path):
|
|
return path
|
|
return os.path.join(WNSEARCHDIR, "data." + filenameroot)
|
|
|
|
def _indexFilePathname(filenameroot):
|
|
if os.name in ('dos', 'nt'):
|
|
- path = os.path.join(WNSEARCHDIR, filenameroot + ".idx")
|
|
+ path = os.path.join(WNSEARCHDIR, filenameroot + ".idx")
|
|
if os.path.exists(path):
|
|
return path
|
|
return os.path.join(WNSEARCHDIR, "index." + filenameroot)
|
|
@@ -1154,30 +1153,30 @@
|
|
#if count > 20:
|
|
# raise "infinite loop"
|
|
lastState = start, end
|
|
- middle = (start + end) / 2
|
|
- if cache.get(middle):
|
|
- offset, line = cache[middle]
|
|
- else:
|
|
- file.seek(max(0, middle - 1))
|
|
- if middle > 0:
|
|
- file.readline()
|
|
- offset, line = file.tell(), file.readline()
|
|
- if currentDepth < cacheDepth:
|
|
- cache[middle] = (offset, line)
|
|
+ middle = (start + end) // 2
|
|
+ if cache.get(middle):
|
|
+ offset, line = cache[middle]
|
|
+ else:
|
|
+ file.seek(max(0, middle - 1))
|
|
+ if middle > 0:
|
|
+ file.readline()
|
|
+ offset, line = file.tell(), file.readline()
|
|
+ if currentDepth < cacheDepth:
|
|
+ cache[middle] = (offset, line)
|
|
#print start, middle, end, offset, line,
|
|
- if offset > end:
|
|
- assert end != middle - 1, "infinite loop"
|
|
- end = middle - 1
|
|
- elif line[:keylen] == key:# and line[keylen + 1] == ' ':
|
|
- return line
|
|
+ if offset > end:
|
|
+ assert end != middle - 1, "infinite loop"
|
|
+ end = middle - 1
|
|
+ elif line[:keylen] == key:# and line[keylen + 1] == ' ':
|
|
+ return line
|
|
#elif offset == end:
|
|
# return None
|
|
- elif line > key:
|
|
- assert end != middle - 1, "infinite loop"
|
|
- end = middle - 1
|
|
- elif line < key:
|
|
- start = offset + len(line) - 1
|
|
- currentDepth = currentDepth + 1
|
|
+ elif line > key:
|
|
+ assert end != middle - 1, "infinite loop"
|
|
+ end = middle - 1
|
|
+ elif line < key:
|
|
+ start = offset + len(line) - 1
|
|
+ currentDepth = currentDepth + 1
|
|
thisState = start, end
|
|
if lastState == thisState:
|
|
# detects the condition where we're searching past the end
|
|
@@ -1206,12 +1205,12 @@
|
|
"""
|
|
index = 0
|
|
for element in sequence:
|
|
- value = element
|
|
- if keyfn:
|
|
- value = keyfn(value)
|
|
- if (not testfn and value == key) or (testfn and testfn(value, key)):
|
|
- return index
|
|
- index = index + 1
|
|
+ value = element
|
|
+ if keyfn:
|
|
+ value = keyfn(value)
|
|
+ if (not testfn and value == key) or (testfn and testfn(value, key)):
|
|
+ return index
|
|
+ index = index + 1
|
|
return None
|
|
|
|
def _partition(sequence, size, count):
|
|
@@ -1224,7 +1223,7 @@
|
|
|
|
partitions = []
|
|
for index in range(0, size * count, size):
|
|
- partitions.append(sequence[index:index + size])
|
|
+ partitions.append(sequence[index:index + size])
|
|
return (partitions, sequence[size * count:])
|
|
|
|
|
|
@@ -1269,49 +1268,49 @@
|
|
but the two implementations aren't directly comparable."""
|
|
|
|
def __init__(this, capacity):
|
|
- this.capacity = capacity
|
|
- this.clear()
|
|
+ this.capacity = capacity
|
|
+ this.clear()
|
|
|
|
def clear(this):
|
|
- this.values = {}
|
|
- this.history = {}
|
|
- this.oldestTimestamp = 0
|
|
- this.nextTimestamp = 1
|
|
+ this.values = {}
|
|
+ this.history = {}
|
|
+ this.oldestTimestamp = 0
|
|
+ this.nextTimestamp = 1
|
|
|
|
def removeOldestEntry(this):
|
|
- while this.oldestTimestamp < this.nextTimestamp:
|
|
- if this.history.get(this.oldestTimestamp):
|
|
- key = this.history[this.oldestTimestamp]
|
|
- del this.history[this.oldestTimestamp]
|
|
- del this.values[key]
|
|
- return
|
|
- this.oldestTimestamp = this.oldestTimestamp + 1
|
|
+ while this.oldestTimestamp < this.nextTimestamp:
|
|
+ if this.history.get(this.oldestTimestamp):
|
|
+ key = this.history[this.oldestTimestamp]
|
|
+ del this.history[this.oldestTimestamp]
|
|
+ del this.values[key]
|
|
+ return
|
|
+ this.oldestTimestamp = this.oldestTimestamp + 1
|
|
|
|
def setCapacity(this, capacity):
|
|
- if capacity == 0:
|
|
- this.clear()
|
|
- else:
|
|
- this.capacity = capacity
|
|
- while len(this.values) > this.capacity:
|
|
- this.removeOldestEntry()
|
|
+ if capacity == 0:
|
|
+ this.clear()
|
|
+ else:
|
|
+ this.capacity = capacity
|
|
+ while len(this.values) > this.capacity:
|
|
+ this.removeOldestEntry()
|
|
|
|
def get(this, key, loadfn=None):
|
|
- value = None
|
|
- if this.values:
|
|
- pair = this.values.get(key)
|
|
- if pair:
|
|
- (value, timestamp) = pair
|
|
- del this.history[timestamp]
|
|
- if value == None:
|
|
- value = loadfn and loadfn()
|
|
- if this.values != None:
|
|
- timestamp = this.nextTimestamp
|
|
- this.nextTimestamp = this.nextTimestamp + 1
|
|
- this.values[key] = (value, timestamp)
|
|
- this.history[timestamp] = key
|
|
- if len(this.values) > this.capacity:
|
|
- this.removeOldestEntry()
|
|
- return value
|
|
+ value = None
|
|
+ if this.values:
|
|
+ pair = this.values.get(key)
|
|
+ if pair:
|
|
+ (value, timestamp) = pair
|
|
+ del this.history[timestamp]
|
|
+ if value == None:
|
|
+ value = loadfn and loadfn()
|
|
+ if this.values != None:
|
|
+ timestamp = this.nextTimestamp
|
|
+ this.nextTimestamp = this.nextTimestamp + 1
|
|
+ this.values[key] = (value, timestamp)
|
|
+ this.history[timestamp] = key
|
|
+ if len(this.values) > this.capacity:
|
|
+ this.removeOldestEntry()
|
|
+ return value
|
|
|
|
|
|
class _NullCache:
|
|
@@ -1319,10 +1318,10 @@
|
|
LRUCache implements), but doesn't store any values."""
|
|
|
|
def clear():
|
|
- pass
|
|
+ pass
|
|
|
|
def get(this, key, loadfn=None):
|
|
- return loadfn and loadfn()
|
|
+ return loadfn and loadfn()
|
|
|
|
|
|
DEFAULT_CACHE_CAPACITY = 1000
|
|
@@ -1335,7 +1334,7 @@
|
|
def enableCache():
|
|
"""Enable the entity cache."""
|
|
if not isinstance(_entityCache, LRUCache):
|
|
- _entityCache = _LRUCache(size)
|
|
+ _entityCache = _LRUCache(size)
|
|
|
|
def clearCache():
|
|
"""Clear the entity cache."""
|
|
@@ -1373,36 +1372,36 @@
|
|
_POSNormalizationTable = {}
|
|
_POStoDictionaryTable = {}
|
|
for pos, abbreviations in (
|
|
- (NOUN, "noun n n."),
|
|
- (VERB, "verb v v."),
|
|
- (ADJECTIVE, "adjective adj adj. a s"),
|
|
- (ADVERB, "adverb adv adv. r")):
|
|
- tokens = string.split(abbreviations)
|
|
- for token in tokens:
|
|
- _POSNormalizationTable[token] = pos
|
|
- _POSNormalizationTable[string.upper(token)] = pos
|
|
+ (NOUN, "noun n n."),
|
|
+ (VERB, "verb v v."),
|
|
+ (ADJECTIVE, "adjective adj adj. a s"),
|
|
+ (ADVERB, "adverb adv adv. r")):
|
|
+ tokens = abbreviations.split()
|
|
+ for token in tokens:
|
|
+ _POSNormalizationTable[token] = pos
|
|
+ _POSNormalizationTable[token.upper()] = pos
|
|
for dict in Dictionaries:
|
|
- _POSNormalizationTable[dict] = dict.pos
|
|
- _POStoDictionaryTable[dict.pos] = dict
|
|
+ _POSNormalizationTable[dict] = dict.pos
|
|
+ _POStoDictionaryTable[dict.pos] = dict
|
|
|
|
_initializePOSTables()
|
|
|
|
def _normalizePOS(pos):
|
|
norm = _POSNormalizationTable.get(pos)
|
|
if norm:
|
|
- return norm
|
|
- raise TypeError, `pos` + " is not a part of speech type"
|
|
+ return norm
|
|
+ raise TypeError(repr(pos) + " is not a part of speech type")
|
|
|
|
def _dictionaryFor(pos):
|
|
pos = _normalizePOS(pos)
|
|
dict = _POStoDictionaryTable.get(pos)
|
|
if dict == None:
|
|
- raise RuntimeError, "The " + `pos` + " dictionary has not been created"
|
|
+ raise RuntimeError("The " + repr(pos) + " dictionary has not been created")
|
|
return dict
|
|
|
|
def buildIndexFiles():
|
|
for dict in Dictionaries:
|
|
- dict._buildIndexCacheFile()
|
|
+ dict._buildIndexCacheFile()
|
|
|
|
|
|
#
|
|
@@ -1412,7 +1411,7 @@
|
|
def _testKeys():
|
|
#This is slow, so don't do it as part of the normal test procedure.
|
|
for dictionary in Dictionaries:
|
|
- dictionary._testKeys()
|
|
+ dictionary._testKeys()
|
|
|
|
def _test(reset=0):
|
|
import doctest, wordnet
|