diff options
| author | Gustav Sörnäs <gustav@sornas.net> | 2020-11-17 15:42:27 +0100 |
|---|---|---|
| committer | Gustav Sörnäs <gustav@sornas.net> | 2020-11-17 15:42:27 +0100 |
| commit | fb80ac50825c7ca1fa063d3493175b7b27adbdb1 (patch) | |
| tree | 4d84895d45ca2c177ddefe11164575bf6762b3ca /labb5/lib/StanfordCPPLib/lexicon.h | |
| parent | 4065799f7080260f507a5e3ea8c2d8375e735166 (diff) | |
| download | tddd86-fb80ac50825c7ca1fa063d3493175b7b27adbdb1.tar.gz | |
add given code
Diffstat (limited to 'labb5/lib/StanfordCPPLib/lexicon.h')
| -rwxr-xr-x | labb5/lib/StanfordCPPLib/lexicon.h | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/labb5/lib/StanfordCPPLib/lexicon.h b/labb5/lib/StanfordCPPLib/lexicon.h new file mode 100755 index 0000000..fea4726 --- /dev/null +++ b/labb5/lib/StanfordCPPLib/lexicon.h @@ -0,0 +1,366 @@ +/* + * File: lexicon.h + * --------------- + * This file exports the <code>Lexicon</code> class, which is a + * compact structure for storing a list of words. + */ + +#ifndef _lexicon_h +#define _lexicon_h + +#include <string> +#include "foreach.h" +#include "set.h" +#include "stack.h" + +/* + * Class: Lexicon + * -------------- + * This class is used to represent a <b><i>lexicon,</i></b> or word list. + * The main difference between a lexicon and a dictionary is that + * a lexicon does not provide any mechanism for storing definitions; + * the lexicon contains only words, with no associated information. + * It is therefore similar to a set of strings, but with a more + * space-efficient internal representation. The <code>Lexicon</code> + * class supports efficient lookup operations for words and prefixes. + * + * <p>As an example of the use of the <code>Lexicon</code> class, the + * following program lists all the two-letter words in the lexicon + * stored in <code>EnglishWords.dat</code>: + * + *<pre> + * int main() { + * Lexicon english("EnglishWords.dat"); + * foreach (string word in english) { + * if (word.length() == 2) { + * cout << word << endl; + * } + * } + * return 0; + * } + *</pre> + */ + +#include <cctype> + +class Lexicon { + +public: + +/* + * Constructor: Lexicon + * Usage: Lexicon lex; + * Lexicon lex(filename); + * ----------------------------- + * Initializes a new lexicon. The default constructor creates an empty + * lexicon. The second form reads in the contents of the lexicon from + * the specified data file. The data file must be in one of two formats: + * (1) a space-efficient precompiled binary format or (2) a text file + * containing one word per line. The Stanford library distribution + * includes a binary lexicon file named <code>English.dat</code> + * containing a list of words in English. The standard code pattern + * to initialize that lexicon looks like this: + * + *<pre> + * Lexicon english("English.dat"); + *</pre> + */ + + Lexicon(); + Lexicon(std::string filename); + +/* + * Destructor: ~Lexicon + * -------------------- + * The destructor deallocates any storage associated with the lexicon. + */ + + virtual ~Lexicon(); + +/* + * Method: size + * Usage: int n = lex.size(); + * -------------------------- + * Returns the number of words contained in the lexicon. + */ + + int size() const; + +/* + * Method: isEmpty + * Usage: if (lex.isEmpty()) ... + * ----------------------------- + * Returns <code>true</code> if the lexicon contains no words. + */ + + bool isEmpty() const; + +/* + * Method: clear + * Usage: lex.clear(); + * ------------------- + * Removes all words from the lexicon. + */ + + void clear(); + +/* + * Method: add + * Usage: lex.add(word); + * --------------------- + * Adds the specified word to the lexicon. + */ + + void add(std::string word); + +/* + * Method: addWordsFromFile + * Usage: lex.addWordsFromFile(filename); + * -------------------------------------- + * Reads the file and adds all of its words to the lexicon. + */ + + void addWordsFromFile(std::string filename); + +/* + * Method: contains + * Usage: if (lex.contains(word)) ... + * ---------------------------------- + * Returns <code>true</code> if <code>word</code> is contained in the + * lexicon. In the <code>Lexicon</code> class, the case of letters is + * ignored, so "Zoo" is the same as "ZOO" or "zoo". + */ + + bool contains(std::string word) const; + +/* + * Method: containsPrefix + * Usage: if (lex.containsPrefix(prefix)) ... + * ------------------------------------------ + * Returns true if any words in the lexicon begin with <code>prefix</code>. + * Like <code>containsWord</code>, this method ignores the case of letters + * so that "MO" is a prefix of "monkey" or "Monday". + */ + + bool containsPrefix(std::string prefix) const; + +/* + * Method: mapAll + * Usage: lexicon.mapAll(fn); + * -------------------------- + * Calls the specified function on each word in the lexicon. + */ + + void mapAll(void (*fn)(std::string)) const; + void mapAll(void (*fn)(const std::string &)) const; + + template <typename FunctorType> + void mapAll(FunctorType fn) const; + +/* + * Additional Lexicon operations + * ----------------------------- + * In addition to the methods listed in this interface, the Lexicon + * class supports the following operations: + * + * - Deep copying for the copy constructor and assignment operator + * - Iteration using the range-based for statement and STL iterators + * + * All iteration is guaranteed to proceed in alphabetical order. All + * words in the lexicon are stored in lowercase. + */ + +/* Private section */ + +/**********************************************************************/ +/* Note: Everything below this point in the file is logically part */ +/* of the implementation and should not be of interest to clients. */ +/**********************************************************************/ + +private: + +#ifdef _WIN32 +#define LITTLE_ENDIAN 1 +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +#pragma pack(1) + struct Edge { +#if defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN + unsigned long letter:5; + unsigned long lastEdge:1; + unsigned long accept:1; + unsigned long unused:1; + unsigned long children:24; +#else + unsigned long children:24; + unsigned long unused:1; + unsigned long accept:1; + unsigned long lastEdge:1; + unsigned long letter:5; +#endif + }; +#pragma pack() + + Edge *edges, *start; + int numEdges, numDawgWords; + Set<std::string> otherWords; + +public: + +/* + * Deep copying support + * -------------------- + * This copy constructor and operator= are defined to make a + * deep copy, making it possible to pass/return lexicons by value + * and assign from one lexicon to another. The entire contents of + * the lexicon, including all words, are copied. Making copies is + * generally avoided because of the expense and thus, lexicons are + * typically passed by reference. When a copy is needed, these + * operations are supported. + */ + + Lexicon(const Lexicon & src); + Lexicon & operator=(const Lexicon & src); + +/* + * Iterator support + * ---------------- + * The classes in the StanfordCPPLib collection implement input + * iterators so that they work symmetrically with respect to the + * corresponding STL classes. + */ + + class iterator : public std::iterator<std::input_iterator_tag,std::string> { + private: + const Lexicon *lp; + int index; + std::string currentDawgPrefix; + std::string currentSetWord; + std::string tmpWord; + Edge *edgePtr; + Stack<Edge *> stack; + Set<std::string>::iterator setIterator; + Set<std::string>::iterator setEnd; + + void advanceToNextWordInDawg(); + void advanceToNextWordInSet(); + void advanceToNextEdge(); + + public: + iterator() { + this->lp = NULL; + } + + iterator(const Lexicon *lp, bool endFlag) { + this->lp = lp; + if (endFlag) { + index = lp->size(); + } else { + index = 0; + edgePtr = NULL; + setIterator = lp->otherWords.begin(); + setEnd = lp->otherWords.end(); + currentDawgPrefix = ""; + currentSetWord = ""; + advanceToNextWordInDawg(); + advanceToNextWordInSet(); + } + } + + iterator(const iterator & it) { + lp = it.lp; + index = it.index; + currentDawgPrefix = it.currentDawgPrefix; + currentSetWord = it.currentSetWord; + edgePtr = it.edgePtr; + stack = it.stack; + setIterator = it.setIterator; + } + + iterator & operator++() { + if (edgePtr == NULL) { + advanceToNextWordInSet(); + } else { + if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { + advanceToNextWordInDawg(); + } else { + advanceToNextWordInSet(); + } + } + index++; + return *this; + } + + iterator operator++(int) { + iterator copy(*this); + operator++(); + return copy; + } + + bool operator==(const iterator & rhs) { + return lp == rhs.lp && index == rhs.index; + } + + bool operator!=(const iterator & rhs) { + return !(*this == rhs); + } + + std::string operator*() { + if (edgePtr == NULL) return currentSetWord; + if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { + return currentDawgPrefix + lp->ordToChar(edgePtr->letter); + } else { + return currentSetWord; + } + } + + std::string *operator->() { + if (edgePtr == NULL) return ¤tSetWord; + if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { + tmpWord = currentDawgPrefix + lp->ordToChar(edgePtr->letter); + return &tmpWord; + } else { + return ¤tSetWord; + } + } + + }; + + iterator begin() const { + return iterator(this, false); + } + + iterator end() const { + return iterator(this, true); + } + +private: + + Edge *findEdgeForChar(Edge *children, char ch) const; + Edge *traceToLastEdge(const std::string & s) const; + void readBinaryFile(std::string filename); + void deepCopy(const Lexicon & src); + int countDawgWords(Edge *start) const; + + unsigned int charToOrd(char ch) const { + return ((unsigned int)(tolower(ch) - 'a' + 1)); + } + + char ordToChar(unsigned int ord) const { + return ((char)(ord - 1 + 'a')); + } + +}; + +template <typename FunctorType> +void Lexicon::mapAll(FunctorType fn) const { + foreach (std::string word in *this) { + fn(word); + } +} + +// hashing functions for lexicons; defined in hashmap.cpp +int hashCode(const Lexicon& l); + +#endif |
