00001 #include "Vocabulary.hpp"
00002 #include "Converter.hpp"
00003 #include "Exception.hpp"
00004 #include <boost/filesystem/fstream.hpp>
00005
00006 namespace aitools {
00007 namespace invertedindex {
00008
00009 Vocabulary::Vocabulary()
00010 {}
00011
00012 Vocabulary::Vocabulary(const bfs::path& path) throw (std::invalid_argument)
00013 {
00014 load(path);
00015 }
00016
00017 Vocabulary::Vocabulary(const Vocabulary& vocabulary)
00018 : hash_map_(vocabulary.hash_map_)
00019 {}
00020
00021 Vocabulary::~Vocabulary()
00022 {}
00023
00024 void
00025 Vocabulary::accumulate(const std::string& word, uint64_t frequency)
00026 {
00027 std::pair<iterator, bool>
00028 insert_result(hash_map_.insert(std::make_pair(word, frequency)));
00029 if (!insert_result.second)
00030 {
00031 insert_result.first->second += frequency;
00032 }
00033 }
00034
00035 Vocabulary::const_iterator
00036 Vocabulary::begin() const
00037 {
00038 return hash_map_.begin();
00039 }
00040
00041 Vocabulary::iterator
00042 Vocabulary::begin()
00043 {
00044 return hash_map_.begin();
00045 }
00046
00047 void
00048 Vocabulary::clear()
00049 {
00050 hash_map_.clear();
00051 }
00052
00053 bool
00054 Vocabulary::contains(const std::string& word) const
00055 {
00056 return hash_map_.find(word) != hash_map_.end();
00057 }
00058
00059 Vocabulary::const_iterator
00060 Vocabulary::end() const
00061 {
00062 return hash_map_.end();
00063 }
00064
00065 Vocabulary::iterator
00066 Vocabulary::end()
00067 {
00068 return hash_map_.end();
00069 }
00070
00071 Vocabulary::const_iterator
00072 Vocabulary::find(const std::string& word) const
00073 {
00074 return hash_map_.find(word);
00075 }
00076
00077 Vocabulary::iterator
00078 Vocabulary::find(const std::string& word)
00079 {
00080 return hash_map_.find(word);
00081 }
00082
00083 uint64_t
00084 Vocabulary::frequency(const std::string& word) const
00085 {
00086 Vocabulary::const_iterator it(find(word));
00087 return it == end() ? 0 : it->second;
00088 }
00089
00090 void
00091 Vocabulary::get_words(std::vector<std::string>& words) const
00092 {
00093 words.clear();
00094 words.reserve(hash_map_.size());
00095 for (hash_map_t::const_iterator it(begin()); it != end(); ++it)
00096 {
00097 words.push_back(it->first);
00098 }
00099 std::sort(words.begin(), words.end());
00100 }
00101
00102 bool
00103 Vocabulary::insert(const std::string& word, uint64_t frequency)
00104 {
00105 return hash_map_.insert(std::make_pair(word, frequency)).second;
00106 }
00107
00108 bool
00109 Vocabulary::is_empty() const
00110 {
00111 return hash_map_.empty();
00112 }
00113
00114 void
00115 Vocabulary::load(const bfs::path& file) throw (std::invalid_argument)
00116 {
00117 bfs::ifstream ifs(file);
00118 if (!ifs)
00119 {
00120 Exception::throw_invalid_argument("Cannot open", file);
00121 }
00122 clear();
00123 std::string line;
00124 uint64_t frequency(0);
00125 std::string::size_type tabpos;
00126 while (std::getline(ifs, line))
00127 {
00128 tabpos = line.rfind(delim);
00129 if (tabpos != std::string::npos)
00130 {
00131 frequency = Converter::str_to_ui64(line.substr(tabpos));
00132 line = line.substr(0, tabpos);
00133 }
00134 accumulate(line, frequency);
00135 }
00136 ifs.close();
00137 }
00138
00139 void
00140 Vocabulary::save(const bfs::path& file) throw (std::invalid_argument)
00141 {
00142 bfs::ofstream ofs(file);
00143 if (!ofs)
00144 {
00145 Exception::throw_invalid_argument("Cannot create", file);
00146 }
00147 tree_map_t sorted_map(begin(), end());
00148 tree_map_t::const_iterator end(sorted_map.end());
00149 for (tree_map_t::const_iterator it(sorted_map.begin()); it != end; ++it)
00150 {
00151 ofs << it->first << delim << it->second << newline;
00152 }
00153 ofs.close();
00154 }
00155
00156 size_t
00157 Vocabulary::size() const
00158 {
00159 return hash_map_.size();
00160 }
00161
00162 }
00163 }
00164
00165 namespace std {
00166
00167 ostream&
00168 operator<<(ostream& os, const aitools::invertedindex::Vocabulary& vocabulary)
00169 {
00170 if (os)
00171 {
00172 os << "aitools::invertedindex::Vocabulary [ size = "
00173 << vocabulary.size() << " ]";
00174 }
00175 return os;
00176 }
00177
00178 }