00001 #include "PostlistBuilder.hpp"
00002 #include "System.hpp"
00003 #include <cstring>
00004
00005 namespace aitools {
00006 namespace invertedindex {
00007
00008 PostlistBuilder::PostlistBuilder()
00009 : vsizes_(new SizeVector),
00010 memory_(0),
00011 page_(NULL)
00012 {}
00013
00014 PostlistBuilder::~PostlistBuilder()
00015 {
00016 delete vsizes_;
00017 if (has_page_file_())
00018 {
00019 std::fclose(page_);
00020 }
00021 }
00022
00023 void
00024 PostlistBuilder::append(const ByteBuffer& value) throw (std::runtime_error)
00025 {
00026 if (has_page_file_())
00027 {
00028 System::fwrite(value.data(), 1, value.size(), page_);
00029 }
00030 else
00031 {
00032 memory_ += value.size();
00033 values_.push_back(value);
00034 if (memory_ > Iterator::max_chunk_size)
00035 {
00036 swap_out_values_();
00037 }
00038 }
00039 if (header_.value_count == 0)
00040 {
00041 header_.value_size = value.size();
00042 }
00043 else if (header_.value_size != value.size())
00044 {
00045 header_.value_size = 0;
00046 }
00047 vsizes_->push_back(value.size());
00048 header_.payload += value.size();
00049 ++header_.value_count;
00050 }
00051
00052 Iterator::SharedPointer
00053 PostlistBuilder::build()
00054 {
00055
00056 if (header_.value_size != 0)
00057 {
00058 vsizes_->clear();
00059 }
00060
00061 Iterator::SharedPointer iterator;
00062 if (has_page_file_())
00063 {
00064 std::rewind(page_);
00065 iterator.reset(new Iterator(header_, vsizes_, page_));
00066 vsizes_ = new SizeVector;
00067 page_ = NULL;
00068 }
00069 else
00070 {
00071 size_t offset(0);
00072 ByteBuffer* buffer(new ByteBuffer(header_.payload));
00073 ValueVector::const_iterator end(values_.end());
00074 for (ValueVector::const_iterator it(values_.begin()); it != end; ++it)
00075 {
00076 std::memcpy(buffer->data() + offset, it->data(), it->size());
00077 offset += it->size();
00078 }
00079 iterator.reset(new Iterator(header_, vsizes_, buffer));
00080 vsizes_ = new SizeVector;
00081 }
00082 clear();
00083 return iterator;
00084 }
00085
00086 void
00087 PostlistBuilder::clear()
00088 {
00089 memory_ = 0;
00090 values_.clear();
00091 vsizes_->clear();
00092 header_ = Header();
00093 if (has_page_file_())
00094 {
00095 std::fclose(page_);
00096 page_ = NULL;
00097 }
00098 }
00099
00100 bool
00101 PostlistBuilder::empty() const
00102 {
00103 return header_.value_count == 0;
00104 }
00105
00106 size_t
00107 PostlistBuilder::length() const
00108 {
00109 return header_.value_count;
00110 }
00111
00112 void
00113 PostlistBuilder::set_checksum(uint16_t checksum)
00114 {
00115 header_.checksum = checksum;
00116 }
00117
00118 void
00119 PostlistBuilder::swap_out_values_() throw (std::runtime_error)
00120 {
00121 page_ = System::tmpfile();
00122 ValueVector::const_iterator end(values_.end());
00123 for (ValueVector::const_iterator it(values_.begin()); it != end; ++it)
00124 {
00125 System::fwrite(it->data(), 1, it->size(), page_);
00126 }
00127 values_.clear();
00128 memory_ = 0;
00129 }
00130
00131 bool
00132 PostlistBuilder::has_page_file_() const
00133 {
00134 return page_ != NULL;
00135 }
00136
00137 }
00138 }