00001
00002
00003
00004 #ifndef AITOOLS_INVERTEDINDEX_EXTERNAL_MERGE_SORT_HPP
00005 #define AITOOLS_INVERTEDINDEX_EXTERNAL_MERGE_SORT_HPP
00006
00007 #include "InternalIntroSort.hpp"
00008 #include "System.hpp"
00009
00010 namespace aitools {
00011 namespace invertedindex {
00012
00023 class ExternalMergeSort {
00024
00025 private:
00026
00027 typedef Configuration::Sorting Sorting;
00028
00029 public:
00030
00031 static const size_t max_memory = 100 * 1024 * 1024;
00032
00033 private:
00034
00035 ExternalMergeSort();
00036
00037 public:
00038
00039 template<typename Value>
00040 static void
00041 sort(Postlist<Value>& postlist, Quantile& quantile, Sorting mode);
00042
00043 private:
00044
00045 template<typename Value>
00046 static void
00047 partition(Postlist<Value>& postlist, Quantile& quantile,
00048 std::vector<FILE*>& buckets, Sorting mode);
00049
00050 template<typename Value>
00051 static void
00052 merge(Postlist<Value>& postlist, std::vector<FILE*>& buckets, Sorting mode);
00053
00054 template<typename Value>
00055 static FILE*
00056 swap_out(const std::vector<Value>& entries);
00057
00058 };
00059
00060
00061
00062 template<typename Value>
00063 void ExternalMergeSort::sort
00064 (Postlist<Value>& postlist, Quantile& quantile, Sorting mode)
00065 {
00066 if (mode == Configuration::DISABLED) return;
00067
00068 if (postlist.payload() / max_memory == 0)
00069 {
00070 return InternalIntroSort::sort(postlist, quantile, mode);
00071 }
00072
00073
00074 std::vector<FILE*> buckets;
00075 partition(postlist, quantile, buckets, mode);
00076
00077
00078 merge(postlist, buckets, mode);
00079 }
00080
00081 template<typename Value>
00082 void ExternalMergeSort::partition
00083 (Postlist<Value>& postlist, Quantile& quantile, std::vector<FILE*>& buckets,
00084 Sorting mode)
00085 {
00086 Value value;
00087 quantile.clear();
00088 Quantile subquantile;
00089 std::vector<Value> entries;
00090 size_t bucket_count(postlist.payload() / max_memory);
00091 size_t entries_per_bucket(postlist.length() / bucket_count);
00092 entries.reserve(entries_per_bucket);
00093 while (postlist.next(value))
00094 {
00095 entries.push_back(value);
00096 if (entries.size() == entries_per_bucket)
00097 {
00098 InternalIntroSort::sort(entries, subquantile, mode);
00099 buckets.push_back(swap_out(entries));
00100 quantile += subquantile;
00101 entries.clear();
00102 }
00103 }
00104 if (!entries.empty())
00105 {
00106 InternalIntroSort::sort(entries, subquantile, mode);
00107 buckets.push_back(swap_out(entries));
00108 quantile += subquantile;
00109 entries.clear();
00110 }
00111 }
00112
00113 template<typename Value>
00114 void ExternalMergeSort::merge
00115 (Postlist<Value>& postlist, std::vector<FILE*>& buckets, Sorting mode)
00116 {
00117 ByteBuffer buffer;
00118
00119 uint16_t value_size;
00120 std::vector<Value> entries(buckets.size());
00121 for (unsigned i(0); i != buckets.size(); ++i)
00122 {
00123 std::rewind(buckets[i]);
00124 System::fread(&value_size, sizeof(value_size), 1, buckets[i]);
00125 buffer.resize(value_size);
00126 System::fread(buffer.data(), 1, value_size, buckets[i]);
00127 entries[i].wrap(buffer);
00128 }
00129 size_t index;
00130 PostlistBuilder builder;
00131 builder.set_checksum(postlist.iterator()->header().checksum);
00132 typename std::vector<Value>::iterator next_value;
00133 while (!entries.empty())
00134 {
00135 if (mode == Configuration::ASCENDING)
00136 {
00137 next_value = std::min_element(entries.begin(), entries.end());
00138 }
00139 else
00140 {
00141 next_value = std::max_element(entries.begin(), entries.end());
00142 }
00143 index = next_value - entries.begin();
00144 next_value->to_bytes(buffer);
00145 builder.append(buffer);
00146 try
00147 {
00148 System::fread(&value_size, sizeof(value_size), 1, buckets[index]);
00149 buffer.resize(value_size);
00150 System::fread(buffer.data(), 1, value_size, buckets[index]);
00151 next_value->wrap(buffer);
00152 }
00153 catch (const std::runtime_error& error)
00154 {
00155 std::fclose(buckets[index]);
00156 entries.erase(next_value);
00157 buckets.erase(buckets.begin() + index);
00158 }
00159 }
00160 postlist = builder.build();
00161 }
00162
00163 template<typename Value>
00164 FILE*
00165 ExternalMergeSort::swap_out(const std::vector<Value>& entries)
00166 {
00167 ByteBuffer buffer;
00168 FILE* bucket(System::tmpfile());
00169 Iterator::value_size_t value_size;
00170 typename std::vector<Value>::const_iterator end(entries.end());
00171 typename std::vector<Value>::const_iterator it(entries.begin());
00172 for (; it != end; ++it)
00173 {
00174 it->to_bytes(buffer);
00175 value_size = buffer.size();
00176 System::fwrite(&value_size, sizeof(value_size), 1, bucket);
00177 System::fwrite(buffer.data(), 1, value_size, bucket);
00178 }
00179 return bucket;
00180 }
00181
00182 }
00183 }
00184
00185 #endif // AITOOLS_INVERTEDINDEX_EXTERNAL_MERGE_SORT_HPP