labb8/lib/StanfordCPPLib/lexicon.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366

/*
 * File: lexicon.h
 * ---------------
 * This file exports the <code>Lexicon</code> class, which is a
 * compact structure for storing a list of words.
 */

#ifndef _lexicon_h
#define _lexicon_h

#include <string>
#include "foreach.h"
#include "set.h"
#include "stack.h"

/*
 * Class: Lexicon
 * --------------
 * This class is used to represent a <b><i>lexicon,</i></b> or word list.
 * The main difference between a lexicon and a dictionary is that
 * a lexicon does not provide any mechanism for storing definitions;
 * the lexicon contains only words, with no associated information.
 * It is therefore similar to a set of strings, but with a more
 * space-efficient internal representation.  The <code>Lexicon</code>
 * class supports efficient lookup operations for words and prefixes.
 *
 * <p>As an example of the use of the <code>Lexicon</code> class, the
 * following program lists all the two-letter words in the lexicon
 * stored in <code>EnglishWords.dat</code>:
 *
 *<pre>
 *    int main() {
 *       Lexicon english("EnglishWords.dat");
 *       foreach (string word in english) {
 *          if (word.length() == 2) {
 *             cout << word << endl;
 *          }
 *       }
 *       return 0;
 *    }
 *</pre>
 */

#include <cctype>

class Lexicon {

public:

/*
 * Constructor: Lexicon
 * Usage: Lexicon lex;
 *        Lexicon lex(filename);
 * -----------------------------
 * Initializes a new lexicon.  The default constructor creates an empty
 * lexicon.  The second form reads in the contents of the lexicon from
 * the specified data file.  The data file must be in one of two formats:
 * (1) a space-efficient precompiled binary format or (2) a text file
 * containing one word per line.  The Stanford library distribution
 * includes a binary lexicon file named <code>English.dat</code>
 * containing a list of words in English.  The standard code pattern
 * to initialize that lexicon looks like this:
 *
 *<pre>
 *    Lexicon english("English.dat");
 *</pre>
 */

   Lexicon();
   Lexicon(std::string filename);

/*
 * Destructor: ~Lexicon
 * --------------------
 * The destructor deallocates any storage associated with the lexicon.
 */

   virtual ~Lexicon();

/*
 * Method: size
 * Usage: int n = lex.size();
 * --------------------------
 * Returns the number of words contained in the lexicon.
 */

   int size() const;

/*
 * Method: isEmpty
 * Usage: if (lex.isEmpty()) ...
 * -----------------------------
 * Returns <code>true</code> if the lexicon contains no words.
 */

   bool isEmpty() const;

/*
 * Method: clear
 * Usage: lex.clear();
 * -------------------
 * Removes all words from the lexicon.
 */

   void clear();

/*
 * Method: add
 * Usage: lex.add(word);
 * ---------------------
 * Adds the specified word to the lexicon.
 */

   void add(std::string word);

/*
 * Method: addWordsFromFile
 * Usage: lex.addWordsFromFile(filename);
 * --------------------------------------
 * Reads the file and adds all of its words to the lexicon.
 */

   void addWordsFromFile(std::string filename);

/*
 * Method: contains
 * Usage: if (lex.contains(word)) ...
 * ----------------------------------
 * Returns <code>true</code> if <code>word</code> is contained in the
 * lexicon.  In the <code>Lexicon</code> class, the case of letters is
 * ignored, so "Zoo" is the same as "ZOO" or "zoo".
 */

   bool contains(std::string word) const;

/*
 * Method: containsPrefix
 * Usage: if (lex.containsPrefix(prefix)) ...
 * ------------------------------------------
 * Returns true if any words in the lexicon begin with <code>prefix</code>.
 * Like <code>containsWord</code>, this method ignores the case of letters
 * so that "MO" is a prefix of "monkey" or "Monday".
 */

   bool containsPrefix(std::string prefix) const;

/*
 * Method: mapAll
 * Usage: lexicon.mapAll(fn);
 * --------------------------
 * Calls the specified function on each word in the lexicon.
 */

   void mapAll(void (*fn)(std::string)) const;
   void mapAll(void (*fn)(const std::string &)) const;

   template <typename FunctorType>
   void mapAll(FunctorType fn) const;

/*
 * Additional Lexicon operations
 * -----------------------------
 * In addition to the methods listed in this interface, the Lexicon
 * class supports the following operations:
 *
 *   - Deep copying for the copy constructor and assignment operator
 *   - Iteration using the range-based for statement and STL iterators
 *
 * All iteration is guaranteed to proceed in alphabetical order.  All
 * words in the lexicon are stored in lowercase.
 */

/* Private section */

/**********************************************************************/
/* Note: Everything below this point in the file is logically part    */
/* of the implementation and should not be of interest to clients.    */
/**********************************************************************/

private:

#ifdef _WIN32
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN
#endif

#pragma pack(1)
   struct Edge {
#if defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN
      unsigned long letter:5;
      unsigned long lastEdge:1;
      unsigned long accept:1;
      unsigned long unused:1;
      unsigned long children:24;
#else
      unsigned long children:24;
      unsigned long unused:1;
      unsigned long accept:1;
      unsigned long lastEdge:1;
      unsigned long letter:5;
#endif
   };
#pragma pack()

   Edge *edges, *start;
   int numEdges, numDawgWords;
   Set<std::string> otherWords;

public:

/*
 * Deep copying support
 * --------------------
 * This copy constructor and operator= are defined to make a
 * deep copy, making it possible to pass/return lexicons by value
 * and assign from one lexicon to another.  The entire contents of
 * the lexicon, including all words, are copied.  Making copies is
 * generally avoided because of the expense and thus, lexicons are
 * typically passed by reference.  When a copy is needed, these
 * operations are supported.
 */

   Lexicon(const Lexicon & src);
   Lexicon & operator=(const Lexicon & src);

/*
 * Iterator support
 * ----------------
 * The classes in the StanfordCPPLib collection implement input
 * iterators so that they work symmetrically with respect to the
 * corresponding STL classes.
 */

   class iterator : public std::iterator<std::input_iterator_tag,std::string> {
   private:
      const Lexicon *lp;
      int index;
      std::string currentDawgPrefix;
      std::string currentSetWord;
      std::string tmpWord;
      Edge *edgePtr;
      Stack<Edge *> stack;
      Set<std::string>::iterator setIterator;
      Set<std::string>::iterator setEnd;

      void advanceToNextWordInDawg();
      void advanceToNextWordInSet();
      void advanceToNextEdge();

   public:
      iterator() {
         this->lp = NULL;
      }

      iterator(const Lexicon *lp, bool endFlag) {
         this->lp = lp;
         if (endFlag) {
            index = lp->size();
         } else {
            index = 0;
            edgePtr = NULL;
            setIterator = lp->otherWords.begin();
            setEnd = lp->otherWords.end();
            currentDawgPrefix = "";
            currentSetWord = "";
            advanceToNextWordInDawg();
            advanceToNextWordInSet();
         }
      }

      iterator(const iterator & it) {
         lp = it.lp;
         index = it.index;
         currentDawgPrefix = it.currentDawgPrefix;
         currentSetWord = it.currentSetWord;
         edgePtr = it.edgePtr;
         stack = it.stack;
         setIterator = it.setIterator;
      }

      iterator & operator++() {
         if (edgePtr == NULL) {
            advanceToNextWordInSet();
         } else {
            if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
               advanceToNextWordInDawg();
            } else {
               advanceToNextWordInSet();
            }
         }
         index++;
         return *this;
      }

      iterator operator++(int) {
         iterator copy(*this);
         operator++();
         return copy;
      }

      bool operator==(const iterator & rhs) {
         return lp == rhs.lp && index == rhs.index;
      }

      bool operator!=(const iterator & rhs) {
         return !(*this == rhs);
      }

      std::string operator*() {
         if (edgePtr == NULL) return currentSetWord;
         if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
            return currentDawgPrefix + lp->ordToChar(edgePtr->letter);
         } else {
            return currentSetWord;
         }
      }

      std::string *operator->() {
         if (edgePtr == NULL) return &currentSetWord;
         if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
            tmpWord = currentDawgPrefix + lp->ordToChar(edgePtr->letter);
            return &tmpWord;
         } else {
            return &currentSetWord;
         }
      }

   };

   iterator begin() const {
      return iterator(this, false);
   }

   iterator end() const {
      return iterator(this, true);
   }

private:

   Edge *findEdgeForChar(Edge *children, char ch) const;
   Edge *traceToLastEdge(const std::string & s) const;
   void readBinaryFile(std::string filename);
   void deepCopy(const Lexicon & src);
   int countDawgWords(Edge *start) const;

   unsigned int charToOrd(char ch) const {
      return ((unsigned int)(tolower(ch) - 'a' + 1));
   }

   char ordToChar(unsigned int ord) const {
      return ((char)(ord - 1 + 'a'));
   }

};

template <typename FunctorType>
void Lexicon::mapAll(FunctorType fn) const {
   foreach (std::string word in *this) {
      fn(word);
   }
}

// hashing functions for lexicons;  defined in hashmap.cpp
int hashCode(const Lexicon& l);

#endif