Source code for campy.datastructures.lexicon

"""
This file exports a :class:`Lexicon` class, a compact structure for storing a list of words.

This :class:`Lexicon` implementation is backed by a data structure called a prefix tree or trie ("try").
"""

# from ..decorators import print_args
import collections as _collections
import collections.abc as _collections_abc

[docs]class Lexicon(_collections.abc.MutableSet): """Representation of a :class:`Lexicon`, or word list. The main difference between a lexicon and a dictionary is that a lexicon does not provide any mechanism for storing definitions; the lexicon contains only words, with no associated information. It is therefore similar to a set of strings, but with a more space-efficient internal representation. The :class:`Lexicon` class supports efficient lookup operations for words and prefixes. For example, the following program lists all of the two-letter words in the lexicon stored at `english.lex`:: lex = Lexicon('english.lex') for word in lex: if len(word) == 2: print(word) """ def __init__(self, file=None): """Initialize a new lexicon. The default constructor creates an empty lexicon. The second form reads in the contents of the lexicon from a specified data filename. Usage:: lex = Lexicon() lex_with_words = Lexicon('english.lex') * Initializes a new lexicon. The default constructor creates an empty * lexicon. The second form reads in the contents of the lexicon from * the specified data file. The data file must be in one of two formats: * (1) a space-efficient precompiled binary format or (2) a text file * containing one word per line. The Stanford library distribution * includes a binary lexicon file named <code>English.dat</code> * containing a list of words in English. The standard code pattern * to initialize that lexicon looks like this: * *<pre> * Lexicon english("English.dat"); *</pre> */ """ self._root = None self.size = 0 if file: self._add_words_from_file(file) def _add_words_from_file(self, file, delimiter='\n'): with open(file, 'r') as f: raw = f.read() # TODO: make this dynamically loaded? for line in raw.split(delimiter): self.add(line)
[docs] def add(self, word): if not word or not word.isalpha(): return False word = word.lower() if not self._root: self._root = _TrieNode() return self._add_helper(self._root, word, word)
[docs] def clear(self): self.size = 0 self._root = None
def __contains__(self, word): if not word or not word.isalpha(): return False word = word.lower() return self._contains_helper(self._root, word, is_prefix=False)
[docs] def contains_prefix(self, word): if not word or not word.isalpha(): return False word = word.lower() return self._contains_helper(self._root, word, is_prefix=True)
[docs] def remove(self, word): if not word or not word.isalpha(): return False word = word.lower() return self._remove_helper(self._root, word, is_prefix=False)
discard = remove
[docs] def remove_prefix(self, word): if not word or not word.isalpha(): return False word = word.lower() return self._remove_helper(self._root, word, is_prefix=True)
def __len__(self): return self.size def __str__(self): return "Lexicon(num_words={self.size})".format(self=self) def __lt__(self): pass # deep copy? def __eq__(self, other): pass #TODO: this is gross def __iter__(self): yield from self.__iter_helper__(self._root, '') def __iter_helper__(self, node, sofar): if not node: return if node.is_word: yield sofar for letter in 'abcdefghijklmnopqrstuvwxyz': yield from self.__iter_helper__(node.get_child(letter), sofar + letter) def __hash__(self): pass def _add_helper(self, node, word, original): if not word: # We've reached the end, mark it as a word already_exists = node.is_word if not already_exists: self.size += 1 node.is_word = True return already_exists first, *rest = word # 'word' -> 'w', ['o', 'r', 'd'] child = node.get_child(first) if not child: child = node.add_child(first) return self._add_helper(child, rest, original) def _contains_helper(self, node, word, is_prefix=False): if not node: # BC: No node reaches this far, so the prefix must not exist. return False if not word: # BC: Found nodes this far. We found it if we're looking for a prefix or we're at a word return is_prefix or node.is_word # RC: Take one step forward return self._contains_helper(node.get_child(word[0]), word[1:], is_prefix) def _remove_helper(self, node, word, ): if not node: # BC: Dead end! return False if not word: # BC: We've walked down the whole tree pass
class _TrieNode(object): ALPHABET_SIZE = 26 def __init__(self): self.is_word = False self.children = [None for _ in range(_TrieNode.ALPHABET_SIZE)] self.num_children = 0 self.num_descendants = 0 def get_child(self, letter): """# pre: letter is between 'a' and 'z' in lowercase""" return self.children[ord(letter) - ord('a')] def add_child(self, letter): child = _TrieNode() self.children[ord(letter) - ord('a')] = child self.num_children += 1 self.num_descendants += 1 return child @property def is_leaf_node(): return self.num_children == 0 def _scrub(string): return ''.join(filter(str.islower, string))