Compare revisions: ads/learned-zindex
Commits on Source (3)
Showing with 122 additions and 4774 deletions
/**
* @file train_test_zindex.cpp
* @author Sachith (sachith.pai@helsinki.fi)
* @brief File to train and test the flood index
* @version 0.1
* @date 2022-05-04
*
* @copyright Copyright (c) 2022
*
*/
#include <iostream>
#include <vector>
#include <algorithm>
#include <ctime>
#include <random>
#include <chrono>
#include <fstream>
#include <cstdlib>
#include <cmath> // double_t
#include <set>
#include <map>
#include <string>
#include "floodlite.h"
#include "../toml11-master/toml.hpp"
#include "../json.hpp"
using namespace std;
using json = nlohmann::json; // using this to dump various logs.
#define addeb if(0)
const string data_path = "./Datasets/";
int main(int argc, char* argv[])
{
const auto data_folder = string(argv[2]);
const auto config = toml::parse(data_path+data_folder+"/Experiments/config/"+string(argv[1])+".toml");
const auto point_class = toml::find<std::string>(config, "point_class");
const auto point_file = toml::find<std::string>(config, "point_file");
const auto query_file = toml::find<std::string>(config, "query_file");
cout.precision(17);
vector<Point> data;
double_t a, b, c, d;
ifstream pointsfile(data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file);
while ( pointsfile >> a >> b)
data.push_back(Point(a,b));
pointsfile.close();
vector<pair<Point,Point>> queries;
ifstream queriesfile(data_path+data_folder+"/Queries/RangeQueries/"+query_file);
while (queriesfile >> a >> b >> c >> d)
queries.push_back(make_pair(Point(a,b),Point(c,d)));
queriesfile.close();
vector<Point> knn_queries;
ifstream knn_queriesfile(data_path+data_folder+"/Queries/KnnQueries/"+point_class);
while (knn_queriesfile >> a >> b)
knn_queries.push_back(Point(a,b));
knn_queriesfile.close();
vector<uint32_t> k_values = toml::find<std::vector<uint32_t>>(config, "knn_k_values");
vector<Point> point_queries;
ifstream point_queriesfile(data_path+data_folder+"/Queries/PointQueries/"+point_class);
while (point_queriesfile >> a >> b)
point_queries.push_back(Point(a,b));
point_queriesfile.close();
uint32_t insert_increments = uint32_t(data.size()*0.1);
uint32_t insert_rq_size = uint32_t(queries.size()*0.05);
vector<Point> insert_queries;
ifstream insert_queriesfile(data_path+data_folder+"/Queries/InsertQueries/"+point_class);
while (insert_queriesfile >> a >> b)
insert_queries.push_back(Point(a,b));
insert_queriesfile.close();
int page_size = toml::find<std::int32_t>(config, "page_size");
cout<<"Finished reading data and queries \n";
cout.flush();
auto flood_train_start = std::chrono::high_resolution_clock::now();
FloodLite flood_obj = FloodLite(data,queries,page_size);
auto flood_train_end = std::chrono::high_resolution_clock::now();
cout<<"Finished training FLOOD"<<endl;
{ /** Range queries*/
json flood_json;
flood_json["model"]="FLOOD";
flood_json["query_file"] = query_file;
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["build_time"] = chrono::duration_cast<chrono::seconds>(flood_train_end - flood_train_start).count();
flood_json["config_id"] = string(argv[1]);
uint64_t result_size =0;
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: queries){
vector<Point> range_query_result = flood_obj.RangeQuery(query);
result_size+=range_query_result.size();
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["range_result_size"]=result_size;
flood_json["page_count"]=flood_obj.page_cnt_;
flood_json["node_count"]=0;
flood_json["index_size"]=flood_obj.ModelSize();
flood_json["range_query_time"] = chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/queries.size();
flood_json["range_query_scantime"] = flood_obj.TimeSpentScanningPages()/queries.size();
flood_json["range_query_page_accessed"]=flood_obj.NumPagesAcessed()/queries.size();
flood_json["range_query_points_scanned"]=flood_obj.NumElementsScanned()/queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/Range.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
{ /** KNN queries */
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
flood_obj.ClearMetric();
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = flood_obj.KNNQuery(query,k);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["k"]=k;
flood_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/knn_queries.size();
flood_json["knn_query_scantime"] = flood_obj.TimeSpentScanningPages()/knn_queries.size();
flood_json["knn_query_page_accessed"]=flood_obj.NumPagesAcessed()/knn_queries.size();
flood_json["knn_query_points_scanned"]=flood_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/KNN.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
}
{ /** Point queries */
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
bool point_query_result;
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = flood_obj.PointQuery(query);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/Point.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
cout<<"Point and KNN query"<<endl;
{ /** Insert queries */
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
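// Benchmark protocol: five epochs, each inserting data.size()*0.1 points
// (insert_increments) and then re-timing the first queries.size()*0.05
// range queries (insert_rq_size) to track performance as the index grows.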
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
// cout<<insert_queries[ins_ix].x_<<" "<<insert_queries[ins_ix].y_<<endl;
flood_obj.InsertElement(insert_queries[ins_ix]);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/insert_increments);
// cout<<" ############## EPOCH: "<<ins_epoch<<" Done ############## "<<endl;
auto flood_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = flood_obj.RangeQuery(queries[i]);
}
auto flood_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(flood_rq_eval_end - flood_rq_eval_start).count()/insert_rq_size);
}
flood_json["insert_query_time"]= insert_times;
flood_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/result/Insert.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
cout<<"FLOOD DONE\n";
cout.flush();
return 0;
}
/**
* @file bounding_rectangle.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief A bounding rectangle instead of a point.
* @version 0.1
* @date 2022-04-25
*
* @copyright Copyright (c) 2022
*
*/
#ifndef RECT_H
#define RECT_H
#include <cmath>
#include <functional>
#include <climits>
#include "point.h"
// template<typename P=Point>
class BoundingRectangle{
public:
Point low_;
Point high_;
BoundingRectangle(){ }
BoundingRectangle(Point x,Point y): low_(x), high_(y){ }
bool IsThereOverlap(const BoundingRectangle& other_box){
return (std::max(low_.x_,other_box.low_.x_)<std::min(high_.x_,other_box.high_.x_)
&& std::max(low_.y_,other_box.low_.y_)<std::min(high_.y_,other_box.high_.y_));
}
bool CheckPointWithin(const Point& point){
return (point.x_ >= low_.x_ && point.x_<high_.x_ &&
point.y_ >= low_.y_ && point.y_<high_.y_);
}
bool operator==(const BoundingRectangle& other_mbr) const{
return (low_.x_ == other_mbr.low_.x_ && low_.y_ == other_mbr.low_.y_ && high_.x_ ==other_mbr.high_.x_ && high_.y_ ==other_mbr.high_.y_);
}
double_t Area(){
return (high_.x_-low_.x_)*(high_.y_-low_.y_);
}
/**
* @brief Check which, if any, of the four forward-iterator cases applies.
* Used as page_.func(query) to check the four cases in RangeQuerySkipping.
* @return uint8_t the four least significant bits encode the cases.
*/
uint8_t CheckFwdIterCases(const BoundingRectangle& other_box){
return ((high_.x_<other_box.low_.x_) | ((high_.y_<other_box.low_.y_)<<1) | ((low_.x_>other_box.high_.x_)<<2) | ((low_.y_>other_box.high_.y_)<<3));
// uint8_t case_mask =0;
// if(high_.x_<other_box.low_.x_) case_mask |= 1;
// if(high_.y_<other_box.low_.y_) case_mask |= 2;
// if(low_.x_>other_box.high_.x_) case_mask |= 4;
// if(low_.y_>other_box.high_.y_) case_mask |= 8;
// return case_mask;
}
};
// /**
// * @brief A equality checker function for bounding boxes.
// * Required to use unordered maps to memoize the optimal solution.
// */
// class EqualBoundingRectangle {
// public:
// bool operator()(const BoundingRectangle& a,const BoundingRectangle& b)
// {
// return (a.low_.x_ == b.low_.x_ && a.low_.y_ == b.low_.y_ && a.high_.x_ == b.high_.x_ && a.high_.y_ == b.high_.y_);
// }
// };
#endif
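A minimal usage sketch for BoundingRectangle (values are illustrative; assumes point.h and bounding_rectangle.h are on the include path):

#include <iostream>
#include "bounding_rectangle.h"

int main(){
    BoundingRectangle box(Point(0.0,0.0), Point(1.0,1.0));
    BoundingRectangle other(Point(0.5,0.5), Point(2.0,2.0));
    // Overlap uses strict comparisons, so merely touching edges do not count.
    std::cout << box.IsThereOverlap(other) << "\n";            // 1
    // Containment is half-open: low-inclusive, high-exclusive.
    std::cout << box.CheckPointWithin(Point(1.0,1.0)) << "\n"; // 0
    std::cout << "area: " << box.Area() << "\n";               // 1
    return 0;
}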
/**
* @file dens_est.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief A density estimation model based on K-D trees.
* @version 0.1
* @date 2022-08-17
*
* @copyright Copyright (c) 2022
*
*/
#ifndef DENS_EST_TREE
#define DENS_EST_TREE
#include <cmath> // double_t
#include <vector>
#include <algorithm>
#include <functional>
#include <iostream>
/**
* @brief Comparator structs used as ordering predicates for std::sort.
*/
struct {
bool operator()(std::pair<double_t, double_t> a, std::pair<double_t, double_t> b) const { return a.first<b.first; }
}x_sort_order_struct;
struct {
bool operator()(std::pair<double_t, double_t> a, std::pair<double_t, double_t> b) const { return a.second<b.second; }
}y_sort_order_struct;
/**
* @brief Class to hold all the node information.
*
*/
class DensEstNode{
public:
double_t min_x_,min_y_,max_x_,max_y_;
double_t counts_;
double_t split_;
DensEstNode* children_[2];
DensEstNode(){
counts_ = 0;
min_x_=0;
min_y_=0;
max_x_=0;
max_y_=0;
children_[0]=NULL;
children_[1]=NULL;
}
};
/**
* @brief Root class of density estimation.
*
*/
class DensEstTree{
public:
DensEstNode* tree_x_;
DensEstNode* tree_y_;
/**
* @brief Construct a new Dens Est Tree object.
* - Given the data and the amount of granularity, partition the tree until you reach that granularity.
* @param data All data points
* @param granularity threshold at which you stop partitioning a node. (min 3)
*/
DensEstTree(std::vector<std::pair<double_t, double_t>> data, size_t granularity, double_t min_x, double_t min_y, double_t max_x, double_t max_y){
std::sort(data.begin(),data.end(),y_sort_order_struct);
tree_x_ = new DensEstNode();
tree_x_->counts_ = data.size();
tree_x_->min_x_ = min_x;
tree_x_->max_x_ = max_x;
tree_x_->min_y_ = min_y;
tree_x_->max_y_ = max_y;
BuildTree(tree_x_,data,granularity,true,min_x,min_y,max_x,max_y);
std::sort(data.begin(),data.end(),x_sort_order_struct);
tree_y_ = new DensEstNode();
tree_y_->counts_ = data.size();
tree_y_->min_x_ = min_x;
tree_y_->max_x_ = max_x;
tree_y_->min_y_ = min_y;
tree_y_->max_y_ = max_y;
BuildTree(tree_y_,data,granularity,false,min_x,min_y,max_x,max_y);
}
/**
* @brief A helper function to build the tree.
*
* @param node
* @param data
* @param granularity
* @param order_x `true` = split along x
*/
void BuildTree(DensEstNode* node, std::vector<std::pair<double_t, double_t>> data,size_t granularity, bool order_x,double_t min_x, double_t min_y, double_t max_x, double_t max_y){
if(data.size()<=granularity){
// std::cout<<"Granularity smaller than "<<granularity<<" :"<<data.size()<<std::endl;
return;
}
// std::cout<<"Splitting "<<data.size()<<" data points"<<std::endl;
size_t mid_point = data.size()/2;
if(order_x){
std::sort(data.begin(),data.end(),x_sort_order_struct);
node->split_=data[mid_point].first;
std::vector<std::pair<double_t, double_t>> new_vector_0(data.begin(),data.begin()+mid_point);
std::vector<std::pair<double_t, double_t>> new_vector_1(data.begin()+mid_point,data.end());
// std::cout<<"Order_x "<<order_x<<" Splits of size "<<data.size()<<" "<<new_vector_0.size()<<" "<<new_vector_1.size()<<std::endl;
node->children_[0]=new DensEstNode();
node->children_[0]->counts_ = new_vector_0.size();
node->children_[0]->min_x_ = min_x;
node->children_[0]->min_y_ = min_y;
node->children_[0]->max_x_ = node->split_;
node->children_[0]->max_y_ = max_y;
BuildTree(node->children_[0],new_vector_0,granularity,false,min_x,min_y,node->split_,max_y);
node->children_[1]=new DensEstNode();
node->children_[1]->counts_ = new_vector_1.size();
node->children_[1]->min_x_ = node->split_;
node->children_[1]->min_y_ = min_y;
node->children_[1]->max_x_ = max_x;
node->children_[1]->max_y_ = max_y;
BuildTree(node->children_[1],new_vector_1,granularity,false,node->split_,min_y,max_x,max_y);
}
else{
std::sort(data.begin(),data.end(),y_sort_order_struct);
node->split_=data[mid_point].second;
std::vector<std::pair<double_t, double_t>> new_vector_0(data.begin(),data.begin()+mid_point);
std::vector<std::pair<double_t, double_t>> new_vector_1(data.begin()+mid_point,data.end());
// std::cout<<"Order_x "<<order_x<<" Splits of size "<<data.size()<<" "<<new_vector_0.size()<<" "<<new_vector_1.size()<<std::endl;
node->children_[0]=new DensEstNode();
node->children_[0]->counts_ = new_vector_0.size();
node->children_[0]->min_x_ = min_x;
node->children_[0]->min_y_ = min_y;
node->children_[0]->max_x_ = max_x;
node->children_[0]->max_y_ = node->split_;
BuildTree(node->children_[0],new_vector_0,granularity,true,min_x,min_y,max_x,node->split_);
node->children_[1]=new DensEstNode();
node->children_[1]->counts_ = new_vector_1.size();
node->children_[1]->min_x_ = min_x;
node->children_[1]->min_y_ = node->split_;
node->children_[1]->max_x_ = max_x;
node->children_[1]->max_y_ = max_y;
BuildTree(node->children_[1],new_vector_1,granularity,true,min_x,node->split_,max_x,max_y);
}
return;
}
/**
* @brief The actual exposed function that estimates the counts of data in a given box.
* - It estimates the number of points returned by both the trees.
* - Averages it to return the values.
*/
double_t EstimateCount(double_t min_x, double_t min_y, double_t max_x, double_t max_y){
double_t tree_x_estimate = EstimateCountHelper(tree_x_,min_x,min_y,max_x,max_y,true);
// std::cout<<"Finished X-tree"<<std::endl;
double_t tree_y_estimate = EstimateCountHelper(tree_y_,min_x,min_y,max_x,max_y,false);
// std::cout<<"Finished Y-tree"<<std::endl;
return (tree_x_estimate+tree_y_estimate)/2L;
}
/**
* @brief The recursive function that goes through the density estimation tree and calculates
* counts within a range query.
* @param node
* @param min_x
* @param min_y
* @param max_x
* @param max_y
* @param order_x
* @return double_t
*/
double_t EstimateCountHelper(DensEstNode* node,double_t min_x,double_t min_y, double_t max_x,double_t max_y,bool order_x){
// std::cout<<order_x<<" ("<<min_x<<","<<min_y<<","<<max_x<<","<<max_y<<") ("<<node->min_x_<<","<<node->min_y_<<","<<node->max_x_<<","<<node->max_y_<<") "<<node->counts_<<std::endl;
/**
* @brief Return 0 if the current tree node has no overlap with the query region.
*/
if(node->min_x_>=max_x || node->max_x_<=min_x || node->min_y_>=max_y || node->max_y_<=min_y ){
// if(node->min_x_>=max_x) std::cout<<"(1)"<<node->min_x_<<" "<<max_x<<std::endl;
// if(node->max_x_<=min_x) std::cout<<"(2)"<<node->max_x_<<" "<<min_x<<std::endl;
// if(node->min_y_>=max_y) std::cout<<"(3)"<<node->min_y_<<" "<<max_y<<std::endl;
// if(node->max_y_<=min_y) std::cout<<"(4)"<<node->max_y_<<" "<<min_y<<std::endl;
// std::cout<<"Found zero overlap"<<std::endl;
return 0L;
}
/**
* @brief If the node region is completely inside the query range, return the whole count.
*/
if(node->min_x_>=min_x && node->max_x_<max_x && node->min_y_>=min_y && node->max_y_<max_y)
return node->counts_;
/**
* @brief If we are at a leaf node, return the number of points in the node scaled by
* the ratio of the overlap between the query region and the node region to the node's area.
*/
if(node->children_[0]==NULL){
double_t lx = std::max(node->min_x_,min_x);
double_t hx = std::min(node->max_x_,max_x);
double_t ly = std::max(node->min_y_,min_y);
double_t hy = std::min(node->max_y_,max_y);
if (lx>=hx || ly>=hy)
return 0L;
double_t area_of_node= (node->max_x_-node->min_x_)*(node->max_y_-node->min_y_);
double_t area_of_overlap= (hx-lx)*(hy-ly);
// std::cout<<"area node/overlap "<<area_of_node<<" "<<area_of_overlap<<std::endl;
// std::cout<<"node counts "<<node->counts_<<std::endl;
return node->counts_*(area_of_overlap/area_of_node);
}
/**
* @brief Recurse into the children if none of the above criteria are met.
*/
double_t result = 0L;
for(int i=0;i<2;i++)
result+=EstimateCountHelper(node->children_[i],min_x,min_y,max_x,max_y,order_x^true);
// std::cout<<order_x<<" ("<<min_x<<","<<min_y<<","<<max_x<<","<<max_y<<") ("<<node->min_x_<<","<<node->min_y_<<","<<node->max_x_<<","<<node->max_y_<<") counts:"<<node->counts_<<" result:"<<result<<std::endl;
return result;
}
};
#endif
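A minimal usage sketch for DensEstTree (values are illustrative; granularity 3 matches the documented minimum; assumes dens_est.h is on the include path):

#include <iostream>
#include <vector>
#include "dens_est.h"

int main(){
    // Four points in the unit square.
    std::vector<std::pair<double_t,double_t>> pts =
        {{0.1,0.2},{0.4,0.4},{0.8,0.9},{0.5,0.1}};
    // Stop splitting once a node holds <= 3 points; bounds cover the data.
    DensEstTree est(pts, 3, 0.0, 0.0, 1.0, 1.0);
    // Estimated number of points in [0,0.5) x [0,0.5); the result is the
    // average of the x-first and y-first trees.
    std::cout << est.EstimateCount(0.0, 0.0, 0.5, 0.5) << std::endl;
    return 0;
}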
/**
* @file ind_ztree.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief Extends `ZTree` with everything needed to build an index following the independence heuristic.
* @version 0.1
* @date 2022-04-25
*/
#ifndef IND_ZTREE_H
#define IND_ZTREE_H
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <iterator>
#include <string>
#include "ztree.h"
#include "point.h"
#define deb if(0)
#define SKIPSIZE 3
class IndZTree: public ZTree{
public:
std::vector<Point> train_dataset;
std::vector<Point> train_query_extremes;
IndZTree(std::vector<Point> &dataset, std::vector<BoundingRectangle> &queries, uint32_t page_size):ZTree(page_size){
num_datapoints_ = dataset.size();
double_t data_low_x,data_low_y,data_high_x,data_high_y;
// wrap points in dataset into train points
train_dataset.clear();
for(auto& datapoint: dataset)
train_dataset.push_back(Point(datapoint));
//wrap queries into starts and ends.
train_query_extremes.clear();
for(auto& query: queries){
train_query_extremes.push_back(Point(query.low_));
train_query_extremes.push_back(Point(query.high_));
}
// sort the train points to give them rank
std::sort(train_dataset.begin(),train_dataset.end(),x_sort_order);
data_low_x=train_dataset[0].x_;
data_high_x=train_dataset[num_datapoints_-1].x_;
std::sort(train_dataset.begin(),train_dataset.end(),y_sort_order);
data_low_y=train_dataset[0].y_;
data_high_y=train_dataset[num_datapoints_-1].y_;
root_=new Node ();
node_cnt_++;
root_->region_.low_=Point(data_low_x,data_low_y);
root_->region_.high_=Point(data_high_x,data_high_y);
// std::cout<<"Data:\n";
BuildTree(root_,train_dataset.begin(),train_dataset.end(),train_query_extremes.begin(),train_query_extremes.end());
BulkLoadData(dataset);
// std::cout<<"\t IndZTree BuildTree done. PageCnt:["<<page_cnt_<<"] NodeCnt: ["<<node_cnt_<<"]"<<std::endl;
}
// Recursively pick a split for each node and build the tree of nodes and pages.
void BuildTree(Node *node,
std::vector<Point>::iterator it_data_begin,std::vector<Point>::iterator it_data_end,
std::vector<Point>::iterator it_query_begin,std::vector<Point>::iterator it_query_end,std::string debspace=""){
std::pair<Point,bool> partition_split = FindOptimalSplitPartition(node,it_data_begin,it_data_end,it_query_begin,it_query_end);
node->partition_ = partition_split.first;
node->ordering_ = partition_split.second;
// std::cout<<debspace<<" ("<<node->region_.low_.x_<<","<<node->region_.low_.y_<<","<<node->region_.high_.x_<<","<<node->region_.high_.y_<<") -> ("<<node->partition_.x_<<","<<node->partition_.y_<<")\n";
if(node->partition_.x_==-1 || node->partition_.y_==-1){
page_cnt_++;
page_array_.push_back(Page(BoundingRectangle(node->region_.low_,node->region_.high_)));
node->page_iter_ = page_array_.end();
node->page_iter_--;
return;
}
for(size_t i=0;i<4;i++)
node->child_[i]= new Node();
node_cnt_+=4;
std::vector<Point>::iterator it_A_data_begin=it_data_begin,it_B_data_begin,it_C_data_begin,it_D_data_begin;
std::vector<Point>::iterator it_A_data_end,it_B_data_end,it_C_data_end,it_D_data_end=it_data_end;
XComparatorPointPartition x_partition_predicate(node->partition_);
YComparatorPointPartition y_partition_predicate(node->partition_);
it_B_data_begin = std::partition(it_data_begin,it_data_end,x_partition_predicate);
it_C_data_end = it_B_data_begin;
it_C_data_begin = std::partition(it_A_data_begin,it_C_data_end,y_partition_predicate);
it_A_data_end = it_C_data_begin;
it_D_data_begin = std::partition(it_B_data_begin,it_D_data_end,y_partition_predicate);
it_B_data_end = it_D_data_begin;
std::vector<Point>::iterator it_A_query_begin=it_query_begin,it_B_query_begin,it_C_query_begin,it_D_query_begin;
std::vector<Point>::iterator it_A_query_end,it_B_query_end,it_C_query_end,it_D_query_end=it_query_end;
it_B_query_begin = std::partition(it_query_begin,it_query_end,x_partition_predicate);
it_C_query_end = it_B_query_begin;
it_C_query_begin = std::partition(it_A_query_begin,it_C_query_end,y_partition_predicate);
it_A_query_end = it_C_query_begin;
it_D_query_begin = std::partition(it_B_query_begin,it_D_query_end,y_partition_predicate);
it_B_query_end = it_D_query_begin;
node->child_[0]->region_ = BoundingRectangle(
Point(node->region_.low_.x_, node->region_.low_.y_),
Point(node->partition_.x_, node->partition_.y_));
BuildTree(node->child_[0],it_A_data_begin,it_A_data_end,it_A_query_begin,it_A_query_end,debspace+"-");
node->child_[1]->region_ = BoundingRectangle(
Point(node->partition_.x_, node->region_.low_.y_),
Point(node->region_.high_.x_, node->partition_.y_));
BuildTree(node->child_[1],it_B_data_begin,it_B_data_end,it_B_query_begin,it_B_query_end,debspace+"-");
node->child_[2]->region_ = BoundingRectangle(
Point(node->region_.low_.x_, node->partition_.y_),
Point(node->partition_.x_, node->region_.high_.y_));
BuildTree(node->child_[2],it_C_data_begin,it_C_data_end,it_C_query_begin,it_C_query_end,debspace+"-");
node->child_[3]->region_ = BoundingRectangle(
Point(node->partition_.x_, node->partition_.y_),
Point(node->region_.high_.x_, node->region_.high_.y_));
BuildTree(node->child_[3],it_D_data_begin,it_D_data_end,it_D_query_begin,it_D_query_end,debspace+"-");
}
std::pair<Point,bool> FindOptimalSplitPartition(Node *node,
std::vector<Point>::iterator it_data_begin,std::vector<Point>::iterator it_data_end,
std::vector<Point>::iterator it_query_begin,std::vector<Point>::iterator it_query_end){
std::vector<Point> collected_queries(it_query_begin,it_query_end);
uint32_t num_data_here = std::distance(it_data_begin,it_data_end);
// std::cout<<"Num data"<<num_data_here<<" queries"<<collected_queries.size()<<std::endl;
if(num_data_here<=page_size_){
return std::make_pair(Point(-1,-1),false);
}
if(collected_queries.size()==0){
Point split = Point(-1,-1);
size_t mid_ix = num_data_here/2;
std::sort(it_data_begin,it_data_end,x_sort_order);
split.x_=(*(it_data_begin+mid_ix)).x_-eps;
std::sort(it_data_begin,it_data_end,y_sort_order);
split.y_=(*(it_data_begin+mid_ix)).y_-eps;
return std::make_pair(split,false);
}
else{
double_t S_best =std::numeric_limits<double_t>::max(), F_x, F_y, G_x, G_y;
double_t F,G,S;
double_t split_x_,split_y_;
std::sort(it_data_begin,it_data_end,x_sort_order);
std::sort(collected_queries.begin(),collected_queries.end(),x_sort_order);
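// For a candidate split at data rank ix: F is the fraction of data points
// below the split and G the fraction of query endpoints below it. Assuming
// the two endpoints of a query are placed independently (the independence
// heuristic), 1-(1-G)^2 is the chance a query touches the lower part and
// 1-G^2 the chance it touches the upper part, so S estimates the expected
// fraction of data scanned per query; the loop keeps the split minimizing S.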
for(size_t ix=SKIPSIZE;ix<=num_data_here-SKIPSIZE;ix+=SKIPSIZE){
F = ix/(num_data_here*1.0);
G = find_x_rank(collected_queries,*(it_data_begin+ix))/(collected_queries.size()*1.0);
S = F*(1-(1-G)*(1-G))+(1-F)*(1-G*G);
if(S<S_best ){
F_x=F;
G_x=G;
S_best =S;
split_x_ = (*(it_data_begin+ix)).x_-eps;
}
}
S_best =std::numeric_limits<double_t>::max();
std::sort(it_data_begin,it_data_end,y_sort_order);
std::sort(collected_queries.begin(),collected_queries.end(),y_sort_order);
for(size_t ix=SKIPSIZE;ix<num_data_here-SKIPSIZE;ix+=SKIPSIZE){
F = ix/(num_data_here*1.0);
G = find_y_rank(collected_queries,*(it_data_begin+ix))/(collected_queries.size()*1.0);
S = F*(1-(1-G)*(1-G))+(1-F)*(1-G*G);
if(S<S_best ){
F_y=F;
G_y=G;
S_best =S;
split_y_ = (*(it_data_begin+ix)).y_-eps;
}
}
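// With the split fixed, compare the two candidate orderings of the middle
// children under the same independence assumption; the cheaper ordering is
// returned as the bool and stored in node->ordering_ by BuildTree.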
double_t shapecost_YX = 2*(1-F_x)*F_y *G_x*G_x*G_y*(1-G_y)
+ 2*F_x*(1-F_y) *(1-G_x)*(1-G_x)*G_y*(1-G_y);
double_t shapecost_XY = 2*(1-F_x)*F_y *G_x*(1-G_x)*G_y*G_y
+ 2*F_x*(1-F_y) *G_x*(1-G_x)*(1-G_y)*(1-G_y);
return std::make_pair(Point(split_x_,split_y_),(shapecost_XY<shapecost_YX));
}
}
// private:
uint32_t find_x_rank(std::vector<Point> &arr, Point &q){
uint32_t low =0,high=arr.size(),mid;
while(low<high){
mid = (low+high)/2;
if(x_sort_order(arr[mid],q))
low = mid+1;
else
high = mid;
}
return low;
}
uint32_t find_y_rank(std::vector<Point> &arr, Point &q){
uint32_t low =0,high=arr.size(),mid;
while(low<high){
mid = (low+high)/2;
if(y_sort_order(arr[mid],q))
low = mid+1;
else
high = mid;
}
return low;
}
};
#endif
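A minimal usage sketch for IndZTree (data, queries, and page size are illustrative; the RangeQuery API on the ZTree base is assumed from its use in the evaluation drivers; assumes the repository's headers are on the include path):

#include <vector>
#include "ind_ztree.h"

int main(){
    std::vector<Point> data;
    for(int i=0;i<1000;i++)
        data.push_back(Point(i/1000.0, ((i*37)%1000)/1000.0));
    // Training workload: two range queries in opposite corners.
    std::vector<BoundingRectangle> queries = {
        BoundingRectangle(Point(0.0,0.0), Point(0.3,0.3)),
        BoundingRectangle(Point(0.6,0.6), Point(0.9,0.9))};
    // Builds the tree and bulk-loads the data in the constructor.
    IndZTree index(data, queries, /*page_size=*/64);
    std::vector<Point> hits = index.RangeQuery(queries[0]);
    (void)hits;
    return 0;
}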
/**
* @file optimal_ztree.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief An attempt at solving for the optimal partitioning and ordering
* under the recursive cost formulation.
* Solving it exactly was infeasible under any of the relaxations tried,
* so this version subsamples candidate splits and solves the recursion on those.
* @version 0.3
* @date 2022-10-28
*
*/
#ifndef OPTIMAL_ZTREE_H
#define OPTIMAL_ZTREE_H
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <vector>
#include <algorithm>
#include <iterator>
#include <utility>
#include <functional>
#include <limits>
#include <unordered_map>
#include "ztree.h"
#include "point.h"
#include "bounding_rectangle.h"
#include "dens_est.h"
class HashBoundingRectangle {
public:
size_t operator()(const BoundingRectangle& mbr)const
{
return (std::hash<double_t>{}(mbr.low_.x_) ^ std::hash<double_t>{}(mbr.low_.y_) ^
std::hash<double_t>{}(mbr.high_.x_) ^ std::hash<double_t>{}(mbr.high_.y_));
}
};
class OptimalZTree: public ZTree{
public:
// std::unordered_map<BoundingRectangle,double_t,HashBoundingRectangle,EqualBoundingRectangle> mempart;
std::unordered_map<BoundingRectangle,double_t,HashBoundingRectangle> memoize_obj_;
std::unordered_map<BoundingRectangle,std::pair<Point,bool>,HashBoundingRectangle> memoize_split_;
XComparatorPoint x_sort_order;
YComparatorPoint y_sort_order;
DensEstTree * datapoint_density_estimator_;
DensEstTree * query_starts_density_estimator_;
DensEstTree * query_ends_density_estimator_;
uint32_t random_sample_size_,dens_est_data_gran_,dens_est_query_gran_;
OptimalZTree(std::vector<Point> &dataset, std::vector<BoundingRectangle> &queries, uint32_t page_size,uint32_t random_sample_size=50,uint32_t dens_est_data_gran=10, uint32_t dens_est_query_gran=10):ZTree(page_size),random_sample_size_(random_sample_size),dens_est_data_gran_(dens_est_data_gran),dens_est_query_gran_(dens_est_query_gran){
std::cout<<"Starting Construct"<<std::endl;
num_datapoints_ = dataset.size();
double_t data_low_x,data_low_y,data_high_x,data_high_y;
std::sort(dataset.begin(),dataset.end(),x_sort_order); // DEV why do we need the data lows and highs
data_low_x=dataset[0].x_;
data_high_x=dataset[num_datapoints_-1].x_;
std::sort(dataset.begin(),dataset.end(),y_sort_order);
data_low_y=dataset[0].y_;
data_high_y=dataset[num_datapoints_-1].y_;
root_=new Node();
node_cnt_++;
root_->region_.low_=Point(data_low_x,data_low_y);
root_->region_.high_=Point(data_high_x,data_high_y);
std::cout<<"Points Sorted"<<std::endl;
ConstructDensityEstimators(dataset, queries,data_low_x,data_low_y,data_high_x+eps,data_high_y+eps);
std::srand(std::time(nullptr)); // use current time as seed for random generator
std::cout<<"Constructed density estimates."<<std::endl;
SolveDPProblem(root_->region_,dataset.begin(),dataset.end(),0);
std::cout<<"SolveDPProblem Done"<<std::endl;
BuildTree(root_);
BulkLoadData(dataset);
std::cout<<"Build Tree Done"<<std::endl;
// // should i note down all the memoize_obj elements?
// std::ofstream o("../opt_function.txt",std::ios_base::out);
// for (auto obj : memoize_obj_)
// o << obj.first.low_.x_<<" "<<obj.first.low_.y_<<" "<<obj.first.high_.x_<<" "<<obj.first.high_.y_<<" "<<obj.second<<std::endl;
// o.close();
}
void ConstructDensityEstimators(std::vector<Point> &dataset, std::vector<BoundingRectangle> &queries, double_t data_low_x, double_t data_low_y, double_t data_high_x, double_t data_high_y){
std::vector<std::pair<double_t,double_t>> rank_data_de;
for(auto &p: dataset)
rank_data_de.push_back(std::make_pair(p.x_,p.y_));
std::vector<std::pair<double_t,double_t>> rank_query_starts_de;
std::vector<std::pair<double_t,double_t>> rank_query_ends_de;
for(auto &q: queries){
rank_query_starts_de.push_back(std::make_pair(q.low_.x_,q.low_.y_));
rank_query_ends_de.push_back(std::make_pair(q.high_.x_,q.high_.y_));
}
std::cout<<"Ready to construct density estimates."<<std::endl;
datapoint_density_estimator_ = new DensEstTree(rank_data_de, dens_est_data_gran_,data_low_x,data_low_y,data_high_x,data_high_y);
std::cout<<"Constructed estimates for datapoint"<<std::endl;
query_starts_density_estimator_ = new DensEstTree(rank_query_starts_de,dens_est_query_gran_,data_low_x,data_low_y,data_high_x,data_high_y);
std::cout<<"Constructed estimates for start"<<std::endl;
query_ends_density_estimator_ = new DensEstTree(rank_query_ends_de,dens_est_query_gran_,data_low_x,data_low_y,data_high_x,data_high_y);
std::cout<<"Constructed estimates for ends"<<std::endl;
}
double_t SolveDPProblem(BoundingRectangle& region,std::vector<Point>::iterator it_data_begin,std::vector<Point>::iterator it_data_end, uint32_t level){
size_t num_data_here = std::distance(it_data_begin,it_data_end);
size_t mid_ix = num_data_here/2;
double_t estimate_query_start_here = query_starts_density_estimator_->EstimateCount(region.low_.x_,region.low_.y_,region.high_.x_,region.high_.y_);
double_t estimate_query_end_here = query_ends_density_estimator_->EstimateCount(region.low_.x_,region.low_.y_,region.high_.x_,region.high_.y_);
//for (int i=0;i<=level;std::cout<<"\t"<<++i){}
//std::cout<<"\tSolvDP"<<region.low_.x_<<" "<<region.low_.y_<<" "<<region.high_.x_<<" "<<region.high_.y_<<" ("<<num_data_here<<" "<<estimate_query_start_here<<" "<<estimate_query_end_here<<")"<<std::endl;
if(memoize_obj_.find(region)!=memoize_obj_.end()){
//for (int i=0;i<=level;std::cout<<"\t"<<++i){}
//std::cout<<" Returning stashed result with obj:"<<memoize_obj_[region]<<std::endl;
return memoize_obj_[region];
}
double_t last_mile_cost = (estimate_query_start_here+estimate_query_end_here) * num_data_here;
if(num_data_here < page_size_ ){
memoize_obj_[region]=last_mile_cost;
memoize_split_[region]=std::make_pair(Point(-1L,-1L),false);
//for (int i=0;i<=level;std::cout<<"\t"<<++i){}
//std::cout<<" Returning because leaf - size:"<<memoize_obj_[region]<<std::endl;
return memoize_obj_[region];
}
Point split = Point(-1,-1);
std::sort(it_data_begin,it_data_end,x_sort_order);
split.x_ = (*(it_data_begin+mid_ix)).x_ - eps;
std::sort(it_data_begin,it_data_end,y_sort_order);
split.y_ = (*(it_data_begin+mid_ix)).y_- eps;
std::pair<double_t,bool> default_split_obj = Objective(region,it_data_begin,it_data_end,split,level);
double_t best_obj_value = default_split_obj.first;
bool best_obj_shape= default_split_obj.second;
if((estimate_query_start_here+estimate_query_end_here)>1L){
Point best_split = split;
for(int i=0;i<std::min(random_sample_size_,uint32_t(4*num_data_here));i++){
size_t candidate_ix_x = std::rand()%num_data_here;
size_t candidate_ix_y = std::rand()%num_data_here;
split.x_=(*(it_data_begin+candidate_ix_x)).x_;
split.y_=(*(it_data_begin+candidate_ix_y)).y_;
//for (int i=0;i<=level;std::cout<<"\t"<<++i){}
//std::cout<<" Trying a split ("<<split.x_<<" "<<split.y_<<std::endl;
std::pair<double_t,bool> objective_value = Objective(region,it_data_begin,it_data_end,split,level);
if(objective_value.first<best_obj_value){
best_split = split; // keep the best candidate seen so far
best_obj_value=objective_value.first;
best_obj_shape=objective_value.second;
}
}
split = best_split;
}
memoize_obj_[region] = best_obj_value; // cache the best objective value for this region
memoize_split_[region] = std::make_pair(split,best_obj_shape);
return memoize_obj_[region];
}
std::pair<double_t,bool> Objective(BoundingRectangle region,std::vector<Point>::iterator it_data_begin,std::vector<Point>::iterator it_data_end,Point partition_point,uint32_t level){
std::vector<Point>::iterator it_A_data_begin=it_data_begin,it_B_data_begin,it_C_data_begin,it_D_data_begin;
std::vector<Point>::iterator it_A_data_end,it_B_data_end,it_C_data_end,it_D_data_end=it_data_end;
XComparatorPointPartition x_partition_predicate(partition_point);
YComparatorPointPartition y_partition_predicate(partition_point);
it_B_data_begin = std::partition(it_data_begin,it_data_end,x_partition_predicate);
it_C_data_end = it_B_data_begin;
it_C_data_begin = std::partition(it_A_data_begin,it_C_data_end,y_partition_predicate);
it_A_data_end = it_C_data_begin;
it_D_data_begin = std::partition(it_B_data_begin,it_D_data_end,y_partition_predicate);
it_B_data_end = it_D_data_begin;
BoundingRectangle region_A = BoundingRectangle(
Point(region.low_.x_, region.low_.y_),
Point(partition_point.x_, partition_point.y_));
BoundingRectangle region_B = BoundingRectangle(
Point(partition_point.x_, region.low_.y_),
Point(region.high_.x_, partition_point.y_));
BoundingRectangle region_C = BoundingRectangle(
Point(region.low_.x_, partition_point.y_),
Point(partition_point.x_, region.high_.y_));
BoundingRectangle region_D = BoundingRectangle(
Point(partition_point.x_, partition_point.y_),
Point(region.high_.x_, region.high_.y_));
double_t lower_level_costs_A = SolveDPProblem(region_A,it_A_data_begin,it_A_data_end,level+1);
double_t lower_level_costs_B = SolveDPProblem(region_B,it_B_data_begin,it_B_data_end,level+1);
double_t lower_level_costs_C = SolveDPProblem(region_C,it_C_data_begin,it_C_data_end,level+1);
double_t lower_level_costs_D = SolveDPProblem(region_D,it_D_data_begin,it_D_data_end,level+1);
double_t lower_level_cost = lower_level_costs_A+lower_level_costs_B+lower_level_costs_C+lower_level_costs_D;
double_t na_raw,nb_raw,nc_raw,nd_raw,ea,eb,ec,ed,sa,sb,sc,sd;
double_t na,nb,nc,nd;
na_raw = datapoint_density_estimator_->EstimateCount(region.low_.x_,region.low_.y_,partition_point.x_,partition_point.y_);
nb_raw = datapoint_density_estimator_->EstimateCount(partition_point.x_,region.low_.y_,region.high_.x_,partition_point.y_);
nc_raw = datapoint_density_estimator_->EstimateCount(region.low_.x_,partition_point.y_,partition_point.x_,region.high_.y_);
nd_raw = datapoint_density_estimator_->EstimateCount(partition_point.x_,partition_point.y_,region.high_.x_,region.high_.y_);
if(na_raw<page_size_*0.25 || nb_raw<page_size_*0.25 || nc_raw<page_size_*0.25 || nd_raw<page_size_*0.25 )
return std::make_pair(std::numeric_limits<double_t>::max(),false);
// if((na+nb+nc+nd)>4*page_size_){
na = (3*ceil((na_raw-page_size_)/(3*page_size_))+1)*page_size_;
nb = (3*ceil((nb_raw-page_size_)/(3*page_size_))+1)*page_size_;
nc = (3*ceil((nc_raw-page_size_)/(3*page_size_))+1)*page_size_;
nd = (3*ceil((nd_raw-page_size_)/(3*page_size_))+1)*page_size_;
// }
// else{
// na = na_raw;
// nb = nb_raw;
// nc = nc_raw;
// nd = nd_raw;
// }
sa = query_starts_density_estimator_->EstimateCount(region.low_.x_,region.low_.y_,partition_point.x_,partition_point.y_);
sb = query_starts_density_estimator_->EstimateCount(partition_point.x_,region.low_.y_,region.high_.x_,partition_point.y_);
sc = query_starts_density_estimator_->EstimateCount(region.low_.x_,partition_point.y_,partition_point.x_,region.high_.y_);
sd = query_starts_density_estimator_->EstimateCount(partition_point.x_,partition_point.y_,region.high_.x_,region.high_.y_);
ea = query_ends_density_estimator_->EstimateCount(region.low_.x_,region.low_.y_,partition_point.x_,partition_point.y_);
eb = query_ends_density_estimator_->EstimateCount(partition_point.x_,region.low_.y_,region.high_.x_,partition_point.y_);
ec = query_ends_density_estimator_->EstimateCount(region.low_.x_,partition_point.y_,partition_point.x_,region.high_.y_);
ed = query_ends_density_estimator_->EstimateCount(partition_point.x_,partition_point.y_,region.high_.x_,region.high_.y_);
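// na..nd: estimated point counts in the four quadrants, snapped above to
// whole-page multiples of the form (3k+1)*page_size_; sa..sd and ea..ed:
// estimated counts of query low corners (starts) and high corners (ends)
// per quadrant. The two shape costs below weight each quadrant's size by
// the queries estimated to reach it under the two candidate child
// orderings; the cheaper ordering is returned along with the total cost.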
double_t shape_cost_YX = na*(eb+ec+ed) + nb*(sa+ec+ed) + nc*(sa+sb+ed) + nd*(sa+sb+sc);
double_t shape_cost_XY = na*(eb+ec+ed) + nb*(sa+sc+ed) + nc*(sa+eb+ed) + nd*(sa+sb+sc);
return std::make_pair(lower_level_cost+std::min(shape_cost_YX,shape_cost_XY),
(shape_cost_YX>shape_cost_XY));
}
uint32_t BuildTree(Node *node){
// std::cout<<"BuildTree "<<node->region_.low_.x_<<" "<<node->region_.low_.y_<<" "<<node->region_.high_.x_<<" "<<node->region_.high_.y_<<std::endl;
std::pair<Point,bool> memoized_split_shape = memoize_split_[node->region_];
node->partition_ = memoized_split_shape.first;
node->ordering_ = memoized_split_shape.second;
if(node->partition_.x_==-1 || node->partition_.y_==-1){
page_cnt_++;
page_array_.push_back(Page(BoundingRectangle(node->region_.low_,node->region_.high_)));
node->page_iter_ = page_array_.end();
node->page_iter_--;
return 1u;
}
for(size_t i=0;i<4;i++)
node->child_[i]= new Node();
node_cnt_+=4;
uint32_t running_offset =0;
node->child_[0]->region_ = BoundingRectangle(
Point(node->region_.low_.x_, node->region_.low_.y_),
Point(node->partition_.x_, node->partition_.y_));
running_offset += BuildTree(node->child_[0]);
node->child_[1]->page_offset_ = running_offset;
node->child_[1]->region_ = BoundingRectangle(
Point(node->partition_.x_, node->region_.low_.y_),
Point(node->region_.high_.x_, node->partition_.y_));
running_offset += BuildTree(node->child_[1]);
node->child_[2]->page_offset_ = running_offset;
node->child_[2]->region_ = BoundingRectangle(
Point(node->region_.low_.x_, node->partition_.y_),
Point(node->partition_.x_, node->region_.high_.y_));
running_offset += BuildTree(node->child_[2]);
node->child_[3]->page_offset_ = running_offset;
node->child_[3]->region_ = BoundingRectangle(
Point(node->partition_.x_, node->partition_.y_),
Point(node->region_.high_.x_, node->region_.high_.y_));
running_offset += BuildTree(node->child_[3]);
return running_offset;
}
};
#endif
#ifndef PAGE_H
#define PAGE_H
#include <vector>
#include <list>
#include <cmath>
#include "point.h"
#include "bounding_rectangle.h"
class Page{
public:
BoundingRectangle page_region_;
std::vector<Point> page_data_;
uint64_t page_mask_; // Instead of having pages, we could assign each page a 'mask' based on a Z-order-like bit flip at each level.
Page(BoundingRectangle region):page_region_(region){}
std::list<Page>::iterator fwd_iter_1_; // LEFT Points to a later page with high_.x_ higher than current page.
std::list<Page>::iterator fwd_iter_2_; // BELOW Points to a later page with high_.y_ higher than current page.
std::list<Page>::iterator fwd_iter_3_; // RIGHT Points to a later page with low_.x_ lower than current page.
std::list<Page>::iterator fwd_iter_4_; // ABOVE Points to a later page with low_.y_ lower than current page.
void InitializeFwdIters(std::list<Page>::iterator iter){
fwd_iter_1_ = iter;
fwd_iter_2_ = iter;
fwd_iter_3_ = iter;
fwd_iter_4_ = iter;
}
double_t DistanceToEdge(const Point& other)
{
return std::min( std::min(std::abs(other.x_-page_region_.low_.x_),std::abs(other.y_-page_region_.low_.y_)),
std::min(std::abs(other.x_-page_region_.high_.x_),std::abs(other.y_-page_region_.high_.y_)));
}
};
#endif
/**
* @file point.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief Class to hold a point
* @version 0.1
* @date 2022-04-25
*
* @copyright Copyright (c) 2022
*
*/
#ifndef POINT_H
#define POINT_H
#include <cmath> // double_t, sqrt
// #include"helper.h"
/**
* @brief The basic point in our setting.
*/
class Point{
public:
double_t x_;
double_t y_;
Point():x_(-1),y_(-1){ }
Point(double_t x,double_t y): x_(x), y_(y){ }
Point(const Point &xy){
x_=xy.x_;
y_=xy.y_;
}
};
/**
* @brief A comparator class for a pair of training points to help sort a vector wrt `x_`
*
*/
class XComparatorPoint {
public:
bool operator()(const Point& a,const Point& b)
{
return (a.x_<b.x_);
}
}x_sort_order;
/**
* @brief A comparator class for a pair of training points to help sort a vector wrt `y_`
*
*/
class YComparatorPoint {
public:
bool operator()(const Point& a,const Point& b)
{
return (a.y_<b.y_);
}
}y_sort_order;
/**
* @brief A predicate class to partition points by `x_` relative to a pivot point.
*
*/
class XComparatorPointPartition {
public:
Point orig;
XComparatorPointPartition(const Point &point):orig(point){ }
bool operator()(const Point& a)
{
return (a.x_<orig.x_);
}
};
/**
* @brief A predicate class to partition points by `y_` relative to a pivot point.
*
*/
class YComparatorPointPartition {
public:
Point orig;
YComparatorPointPartition(const Point &point):orig(point){ }
bool operator()(const Point& a)
{
return (a.y_<orig.y_);
}
};
/**
* @brief A comparator class for sorting points according to distance for KNN queries.
*/
class DistanceComparator {
public:
Point orig;
DistanceComparator(const Point &point):orig(point){ }
bool operator()(const Point& a,const Point& b)
{
double_t distance_a = (orig.x_-a.x_)*(orig.x_-a.x_) + (orig.y_-a.y_)*(orig.y_-a.y_);
double_t distance_b = (orig.x_-b.x_)*(orig.x_-b.x_) + (orig.y_-b.y_)*(orig.y_-b.y_);
return (distance_a<distance_b);
}
double_t Distance(const Point& a)
{
return sqrt((orig.x_-a.x_)*(orig.x_-a.x_) + (orig.y_-a.y_)*(orig.y_-a.y_));
}
};
#endif
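A minimal sketch of the comparators in use (values are illustrative; assumes point.h is on the include path):

#include <algorithm>
#include <iostream>
#include <vector>
#include "point.h"

int main(){
    std::vector<Point> pts = {Point(0.9,0.1), Point(0.2,0.8), Point(0.5,0.5)};
    std::sort(pts.begin(), pts.end(), x_sort_order);  // ascending x_
    DistanceComparator from_center(Point(0.5,0.5));
    std::sort(pts.begin(), pts.end(), from_center);   // nearest to (0.5,0.5) first
    std::cout << from_center.Distance(pts[0]) << std::endl; // 0 for (0.5,0.5)
    return 0;
}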
/**
* @file train_test_zindex.cpp
* @author Sachith (sachith.pai@helsinki.fi)
* @brief File to train and test the zindex.
* @version 0.1
* @date 2022-05-04
*
* @copyright Copyright (c) 2022
*
*/
#include <iostream>
#include <vector>
#include <algorithm>
#include <ctime>
#include <random>
#include <chrono>
#include <fstream>
#include <cstdlib>
#include <cmath> // double_t
#include <set>
#include <map>
#include <string>
#include <ind_ztree.h>
#include <sampl_ztree.h>
#include <vanilla_ztree.h>
// #include<optimal_ztree.h>
#include "../toml11-master/toml.hpp"
#include "../json.hpp"
using namespace std;
using json = nlohmann::json; // using this to dump various logs.
#define addeb if(0)
const string data_path = "../Datasets/";
int main(int argc, char* argv[])
{
const auto data_folder = string(argv[2]);
const auto experiment_name = string(argv[3]);
const auto config = toml::parse(data_path+data_folder+"/Experiments/"+experiment_name+"/config/"+string(argv[1])+".toml");
const auto point_class = toml::find<std::string>(config, "point_class");
const auto point_file = toml::find<std::string>(config, "point_file");
const auto query_file = toml::find<std::string>(config, "query_file");
cout.precision(17);
vector<Point> data;
double_t a, b, c, d;
ifstream pointsfile(data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file);
while ( pointsfile >> a >> b)
data.push_back(Point(a,b));
pointsfile.close();
uint32_t insert_increments = uint32_t(data.size()*0.1);
cout<<"Finished reading data "<<data.size()<<endl;
vector<BoundingRectangle> queries;
ifstream queriesfile(data_path+data_folder+"/Queries/RangeQueries/"+query_file);
while (queriesfile >> a >> b >> c >> d)
queries.push_back(BoundingRectangle(Point(a,b),Point(c,d)));
uint32_t insert_rq_size = uint32_t(queries.size()*0.05);
queriesfile.close();
vector<Point> knn_queries;
ifstream knn_queriesfile(data_path+data_folder+"/Queries/KnnQueries/"+point_class);
while (knn_queriesfile >> a >> b)
knn_queries.push_back(Point(a,b));
knn_queriesfile.close();
vector<uint32_t> k_values = toml::find<std::vector<uint32_t>>(config, "knn_k_values");
vector<Point> point_queries;
ifstream point_queriesfile(data_path+data_folder+"/Queries/PointQueries/"+point_class);
while (point_queriesfile >> a >> b)
point_queries.push_back(Point(a,b));
point_queriesfile.close();
vector<Point> insert_queries;
ifstream insert_queriesfile(data_path+data_folder+"/Queries/InsertQueries/"+point_class);
while (insert_queriesfile >> a >> b)
insert_queries.push_back(Point(a,b));
insert_queriesfile.close();
int page_size = toml::find<std::int32_t>(config, "page_size");
//############ VANILLA ############
{
ZTree vanilla_eval_obj(page_size);
vanilla_eval_obj.LoadTree(data_path+data_folder+"/Experiments/"+experiment_name+"/trees/VANILLA"+string(argv[1])+".txt");
vanilla_eval_obj.BulkLoadData(data);
{ // KNN queries
json vanilla_json;
vanilla_json["model"]="VANILLA";
vanilla_json["point_class"] = point_class;
vanilla_json["point_file"] = point_file;
vanilla_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
vanilla_eval_obj.ClearMetric();
vanilla_json["k"] = k;
auto vanilla_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = vanilla_eval_obj.KNNQuery(query,k);
}
auto vanilla_eval_end = std::chrono::high_resolution_clock::now();
vanilla_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(vanilla_eval_end - vanilla_eval_start).count()/knn_queries.size();
vanilla_json["knn_query_scantime"] = vanilla_eval_obj.TimeSpentScanningPages()/knn_queries.size();
vanilla_json["knn_query_page_accessed"]=vanilla_eval_obj.NumPagesAcessed()/knn_queries.size();
vanilla_json["knn_query_points_scanned"]=vanilla_eval_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/KNN.json",ios_base::app);
o << vanilla_json << std::endl;
o.close();
}
}
{ // Point queries
json vanilla_json;
vanilla_json["model"]="VANILLA";
vanilla_json["point_class"] = point_class;
vanilla_json["point_file"] = point_file;
vanilla_json["config_id"] = string(argv[1]);
bool point_query_result;
auto vanilla_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = vanilla_eval_obj.PointQuery(query);
}
auto vanilla_eval_end = std::chrono::high_resolution_clock::now();
vanilla_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(vanilla_eval_end - vanilla_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Point.json",ios_base::app);
o << vanilla_json << std::endl;
o.close();
}
{ // Insert queries
json vanilla_json;
vanilla_json["model"]="VANILLA";
vanilla_json["point_class"] = point_class;
vanilla_json["point_file"] = point_file;
vanilla_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto vanilla_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
vanilla_eval_obj.InsertElement(insert_queries[ins_ix]);
}
auto vanilla_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(vanilla_eval_end - vanilla_eval_start).count()/insert_increments);
auto vanilla_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = vanilla_eval_obj.RangeQuery(queries[i]);
}
auto vanilla_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(vanilla_rq_eval_end - vanilla_rq_eval_start).count()/insert_rq_size);
}
vanilla_json["insert_query_time"]= insert_times;
vanilla_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Insert.json",ios_base::app);
o << vanilla_json << std::endl;
o.close();
}
}
cout<<"VANILLA DONE\n";
cout.flush();
//############ SAMPL ############
{
uint32_t data_gran = toml::find<std::uint32_t>(config, "data_dens_est_granularity");
uint32_t query_gran = toml::find<std::uint32_t>(config, "query_dens_est_granularity");
uint32_t num_sampl = toml::find<std::uint32_t>(config, "num_samples");
ZTree sampl_eval_obj(page_size);
sampl_eval_obj.LoadTree(data_path+data_folder+"/Experiments/"+experiment_name+"/trees/SAMPL"+string(argv[1])+".txt");
sampl_eval_obj.BulkLoadData(data);
{ // KNN queries
json sampl_json;
sampl_json["model"]="SAMPL";
sampl_json["point_class"] = point_class;
sampl_json["point_file"] = point_file;
sampl_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
sampl_eval_obj.ClearMetric();
sampl_json["k"] = k;
auto sampl_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = sampl_eval_obj.KNNQuery(query,k);
}
auto sampl_eval_end = std::chrono::high_resolution_clock::now();
sampl_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(sampl_eval_end - sampl_eval_start).count()/knn_queries.size();
sampl_json["knn_query_scantime"] = sampl_eval_obj.TimeSpentScanningPages()/knn_queries.size();
sampl_json["knn_query_page_accessed"]=sampl_eval_obj.NumPagesAcessed()/knn_queries.size();
sampl_json["knn_query_points_scanned"]=sampl_eval_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/KNN.json",ios_base::app);
o << sampl_json << std::endl;
o.close();
}
}
{ // Point queries
json sampl_json;
sampl_json["model"]="SAMPL";
sampl_json["point_class"] = point_class;
sampl_json["point_file"] = point_file;
sampl_json["config_id"] = string(argv[1]);
bool point_query_result;
auto sampl_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = sampl_eval_obj.PointQuery(query);
}
auto sampl_eval_end = std::chrono::high_resolution_clock::now();
sampl_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(sampl_eval_end - sampl_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Point.json",ios_base::app);
o << sampl_json << std::endl;
o.close();
}
{ // Insert queries
json sampl_json;
sampl_json["model"]="SAMPL";
sampl_json["point_class"] = point_class;
sampl_json["point_file"] = point_file;
sampl_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto sampl_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
sampl_eval_obj.InsertElement(insert_queries[ins_ix]);
}
auto sampl_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(sampl_eval_end - sampl_eval_start).count()/insert_increments);
auto sampl_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = sampl_eval_obj.RangeQuery(queries[i]);
}
auto sampl_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(sampl_rq_eval_end - sampl_rq_eval_start).count()/insert_rq_size);
}
sampl_json["insert_query_time"]= insert_times;
sampl_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Insert.json",ios_base::app);
o << sampl_json << std::endl;
o.close();
}
}
cout<<"SAMPL DONE\n";
cout.flush();
//############ SAMPLPLUS ############
{
uint32_t data_gran = toml::find<std::uint32_t>(config, "data_dens_est_granularity");
uint32_t query_gran = toml::find<std::uint32_t>(config, "query_dens_est_granularity");
uint32_t num_sampl = toml::find<std::uint32_t>(config, "num_samples");
ZTree samplplus_eval_obj(page_size,true);
samplplus_eval_obj.LoadTree(data_path+data_folder+"/Experiments/"+experiment_name+"/trees/SAMPLPLUS"+string(argv[1])+".txt");
samplplus_eval_obj.BulkLoadData(data);
{ // KNN queries
json samplplus_json;
samplplus_json["model"]="SAMPLPLUS";
samplplus_json["point_class"] = point_class;
samplplus_json["point_file"] = point_file;
samplplus_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
samplplus_eval_obj.ClearMetric();
samplplus_json["k"] = k;
auto samplplus_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = samplplus_eval_obj.KNNQuery(query,k);
}
auto samplplus_eval_end = std::chrono::high_resolution_clock::now();
samplplus_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(samplplus_eval_end - samplplus_eval_start).count()/knn_queries.size();
samplplus_json["knn_query_scantime"] = samplplus_eval_obj.TimeSpentScanningPages()/knn_queries.size();
samplplus_json["knn_query_page_accessed"]=samplplus_eval_obj.NumPagesAcessed()/knn_queries.size();
samplplus_json["knn_query_points_scanned"]=samplplus_eval_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/KNN.json",ios_base::app);
o << samplplus_json << std::endl;
o.close();
cout<<"SAMPLPLUS knn "<<k<<endl;
}
}
cout<<"Done with SAMPLPLUS knn queries."<<endl;
{ // Point queries
json samplplus_json;
samplplus_json["model"]="SAMPLPLUS";
samplplus_json["point_class"] = point_class;
samplplus_json["point_file"] = point_file;
samplplus_json["config_id"] = string(argv[1]);
bool point_query_result;
auto samplplus_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = samplplus_eval_obj.PointQuery(query);
}
auto samplplus_eval_end = std::chrono::high_resolution_clock::now();
samplplus_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(samplplus_eval_end - samplplus_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Point.json",ios_base::app);
o << samplplus_json << std::endl;
o.close();
}
cout<<"Done with SAMPLPLUS point queries."<<endl;
{ // Insert queries
json samplplus_json;
samplplus_json["model"]="SAMPLPLUS";
samplplus_json["point_class"] = point_class;
samplplus_json["point_file"] = point_file;
samplplus_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto samplplus_insert_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
samplplus_eval_obj.InsertElement(insert_queries[ins_ix]);
}
auto samplplus_insert_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(samplplus_insert_eval_end - samplplus_insert_eval_start).count()/insert_increments);
auto samplplus_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = samplplus_eval_obj.RangeQuery(queries[i]);
}
auto samplplus_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(samplplus_rq_eval_end - samplplus_rq_eval_start).count()/insert_rq_size);
}
samplplus_json["insert_query_time"]= insert_times;
samplplus_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Insert.json",ios_base::app);
o << samplplus_json << std::endl;
o.close();
}
cout<<"Done with SAMPLPLUS insert queries."<<endl;
}
cout<<"SAMPLPLUS DONE\n";
cout.flush();
return 0;
}
#!/bin/bash
# LZC
cd LZC
g++ -O3 -std=c++2a -I./include/ evaluate_base_wazi_waziplus.cpp -o evaluate_base_wazi_waziplus.out
# FLOOD
cd ../FLOOD
g++ -O3 -std=c++2a evaluate_flood.cpp -o evaluate_flood.out
# QUILTS
cd ../QUILTS
g++ -O3 -std=c++2a evaluate_quilts.cpp -o evaluate_quilts.out
# ZPGM
cd ../ZPGM
g++ -O3 -std=c++2a evaluate_zpgm.cpp -I./include -mbmi2 -fopenmp -o evaluate_zpgm.out
#!/bin/bash
cd Datasets/Distributions/
mkdir Iberian
cd Iberian
wget https://helsinkifi-my.sharepoint.com/:u:/g/personal/sachith_ad_helsinki_fi/EXNMRxic4_BFqsCY_idMr-0B2kQBYrMZJWtXgw0vYE4LeA?e=yzTNRm?download=1
# wget https://helsinkifi-my.sharepoint.com/:f:/g/personal/sachith_ad_helsinki_fi/EoBlr5O-khJJmJU7Q81O_gYB65VvxkY-UbFvC13vzUZ3_A?e=zLhu1U
# wget https://helsinkifi-my.sharepoint.com/:f:/g/personal/sachith_ad_helsinki_fi/EtKH2ydWdIFNiouys_jPBdgB_jmzkCczU54MkDsnalNMnA?e=0qPJs8
cd ../../
import toml
from pathlib import Path
import numpy as np
import pandas as pd
from itertools import product
import math
import sys
##### Reading all the required data.
p=Path(".")
parsed_toml = toml.load("./config.toml")
########### QueryTimeBreakdown #######
experimentPath = p/'Datasets'/parsed_toml['data_folder']/'Experiments'
experimentPath.mkdir(parents=True,exist_ok=False)
with open(experimentPath/'fullconfig.toml', 'w') as f:
toml.dump(parsed_toml, f)
for subdir in ['plots', 'trees', 'result']:
    (experimentPath/subdir).mkdir(parents=True)
config_dict = {}
config_dict['page_size'] = parsed_toml['page_size']
config_dict['num_samples'] = parsed_toml['random']['num_samples']
config_dict['data_dens_est_granularity'] = parsed_toml['lzc']['data_dens_est_granularity']
config_dict['query_dens_est_granularity'] = parsed_toml['lzc']['query_dens_est_granularity']
cnt = 1
confPath = experimentPath/'config'
confPath.mkdir(parents=True,exist_ok=True)
point_class_list = parsed_toml['datagen']['data_dist']
rsmi_build_list = []
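# One config file per (dataset size, distribution, selectivity) combination, numbered sequentially.
# rsmi_build_list records the first config id of each (size, distribution) pair, so RSMI is only built once per dataset.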
for datasize in parsed_toml['datagen']['dataset_size']:
for point_class in point_class_list:
rsmi_build_list.append(cnt)
for selec in parsed_toml['datagen']['query_selectivity']:
config_dict['point_class'] = point_class
config_dict['point_file'] = str(datasize)
config_dict['query_file']='{}_{}'.format(point_class,str(int(selec*1000000)))
config_dict['knn_k_values'] = parsed_toml['knn']['knn_k_values']
config_dict['insert_points'] = point_class
with open(confPath/('%d.toml'%(cnt)), 'w') as f:
toml.dump(config_dict, f)
cnt+=1
rsmi_build_list.append('')
with open(experimentPath/'RSMI_build_list', 'w') as f:
f.write('\n'.join([str(a) for a in rsmi_build_list]))
sys.stdout.write(str(cnt-1))
sys.stdout.flush()
import math
from pathlib import Path
import numpy as np
import toml
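# Build axis-aligned square range queries of half-width w, centred on the sampled query points and clipped to the unit square.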
def sampleClusteredQueries(query_dist, w, queriesFolder, filename):
    x, y = query_dist.T
    allQueries = np.stack([np.clip(x - w, 0, 1),
                           np.clip(y - w, 0, 1),
                           np.clip(x + w, 0, 1),
                           np.clip(y + w, 0, 1)], axis=-1)
np.savetxt(queriesFolder/filename, allQueries, delimiter=' ',fmt='%.13f')
p=Path("./Datasets/Distributions")
target = Path(".")
parsed_toml = toml.load(target/"config.toml")
##### Reading all the required data.
datasets_arr = parsed_toml['datagen']['data_dist']
for dataset_name in datasets_arr:
data = np.loadtxt(p/dataset_name/'data_dist')
query = np.loadtxt(p/dataset_name/'query_dist')
np.random.shuffle(query)
dataset_folder = target/'Datasets'/parsed_toml['data_folder']
# Save data points
(dataset_folder/"DataPoints"/dataset_name).mkdir(parents=True,exist_ok=True)
for d in parsed_toml['datagen']['dataset_size']:
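        # dataset_size values are given in millions of points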
np.savetxt(dataset_folder/"DataPoints"/dataset_name/str(d), data[:(d*1000000)], delimiter=' ',fmt='%.13f')
# Range query
(dataset_folder/"Queries"/"RangeQueries").mkdir(parents=True,exist_ok=True)
for selec in parsed_toml['datagen']['query_selectivity']:
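        # Query half-width: a square of side 2w has area (2w)^2 = 4*selec/400 = selec/100 of the unit square.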
w = math.sqrt(selec/400.0)
query_file ='{}_{}'.format(dataset_name,str(int(selec*1000000)))
sampleClusteredQueries(query[:20000],w,dataset_folder/"Queries"/"RangeQueries",query_file)
# KNN query
(dataset_folder/"Queries"/"KnnQueries").mkdir(parents=True,exist_ok=True)
knn_queries = data[np.random.choice(np.arange(data.shape[0]),10000)]
np.savetxt(dataset_folder/"Queries"/"KnnQueries"/dataset_name, knn_queries, delimiter=' ',fmt='%.13f')
# Point query
(dataset_folder/"Queries"/"PointQueries").mkdir(parents=True,exist_ok=True)
pnt_queries = data[np.random.choice(np.arange(data.shape[0]),50000)]
np.savetxt(dataset_folder/"Queries"/"PointQueries"/dataset_name, pnt_queries, delimiter=' ',fmt='%.13f')
# Insert query (The amount to insert is managed by the eval scripts)
(dataset_folder/"Queries"/"InsertQueries").mkdir(parents=True,exist_ok=True)
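    # Everything beyond the first 64M points in the raw distribution file is reserved as the insert workload.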
insert_queries = data[64000000:]
np.savetxt(dataset_folder/"Queries"/"InsertQueries"/dataset_name, insert_queries, delimiter=' ',fmt='%.13f')
/**
 * @file evaluate_quilts.cpp
 * @author Sachith (sachith.pai@helsinki.fi)
 * @brief Build the QUILTS index and evaluate range, KNN, point, and insert queries.
* @version 0.1
* @date 2022-05-04
*
* @copyright Copyright (c) 2022
*
*/
#include <iostream>
#include <vector>
#include <algorithm>
#include <time.h>
#include <random>
#include <chrono>
#include <fstream>
#include <stdlib.h>
#include <set>
#include <map>
#include <string>
#include "quilts.h"
#include "../toml11-master/toml.hpp"
#include "../json.hpp"
using namespace std;
using json = nlohmann::json; // using this to dump various logs.
const string data_path = "./Datasets/";
using pddii = pair<pair<double_t,double_t>,pair<uint32_t,uint32_t>>;
bool sortbyfirst(const pddii &a, const pddii &b){return (a.first.first < b.first.first);}
bool sortbysecond(const pddii &a, const pddii &b){return (a.first.second < b.first.second);}
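// Rank-space projection: sort once by x and record each point's x-rank, then
// sort by y and record its y-rank, so the index is built on integer rank
// coordinates instead of raw doubles.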
void rankspaceprojection(vector<pddii> &arr){
    sort(arr.begin(), arr.end(), sortbyfirst);
    for(size_t i=0;i<arr.size();i++)
        arr[i].second.first = i;
    sort(arr.begin(), arr.end(), sortbysecond);
    for(size_t i=0;i<arr.size();i++)
        arr[i].second.second = i;
}
int main(int argc, char* argv[])
{
const auto data_folder = string(argv[2]);
const auto config = toml::parse(data_path+data_folder+"/Experiments/config/"+string(argv[1])+".toml");
const auto point_class = toml::find<std::string>(config, "point_class");
const auto point_file = toml::find<std::string>(config, "point_file");
const auto query_file = toml::find<std::string>(config, "query_file");
cout.precision(17);
vector<pddii> data_raw;
vector<double_t> x_values;
vector<double_t> y_values;
double_t a, b, c, d;
ifstream pointsfile(data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file);
while ( pointsfile >> a >> b){
data_raw.push_back(make_pair(make_pair(a,b),make_pair(0,0)));
x_values.push_back(a);
y_values.push_back(b);
}
pointsfile.close();
rankspaceprojection(data_raw);
sort(x_values.begin(), x_values.end());
sort(y_values.begin(), y_values.end());
vector<Point> data;
for(int i=0;i<data_raw.size();i++){
data.push_back(Point(data_raw[i].second.first,data_raw[i].second.second));
}
vector<pair<Point,Point>> queries;
ifstream queriesfile(data_path+data_folder+"/Queries/RangeQueries/"+query_file);
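    // Map each query rectangle into rank space: lower_bound on the sorted
    // coordinate lists yields, for each coordinate, the number of data values below it.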
while (queriesfile >> a >> b >> c >> d){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
uint32_t rank_c = (lower_bound(x_values.begin(),x_values.end(),c)-x_values.begin());
uint32_t rank_d = (lower_bound(y_values.begin(),y_values.end(),d)-y_values.begin());
queries.push_back(make_pair(Point(rank_a,rank_b),Point(rank_c,rank_d)));
}
queriesfile.close();
vector<Point> knn_queries;
ifstream knn_queriesfile(data_path+data_folder+"/Queries/KnnQueries/"+point_class);
while (knn_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
knn_queries.push_back(Point(rank_a,rank_b));
}
knn_queriesfile.close();
vector<uint32_t> k_values = toml::find<std::vector<uint32_t>>(config, "knn_k_values");
vector<Point> point_queries;
ifstream point_queriesfile(data_path+data_folder+"/Queries/PointQueries/"+point_class);
while (point_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
point_queries.push_back(Point(rank_a,rank_b));
}
point_queriesfile.close();
uint32_t insert_increments = uint32_t(data.size()*0.1);
uint32_t insert_rq_size = uint32_t(queries.size()*0.05);
vector<Point> insert_queries;
ifstream insert_queriesfile(data_path+data_folder+"/Queries/InsertQueries/"+point_class);
while (insert_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
insert_queries.push_back(Point(rank_a,rank_b));
}
insert_queriesfile.close();
int page_size = toml::find<std::int32_t>(config, "page_size");
cout<<"Finished reading data n queries \n";
cout.flush();
auto quilts_train_start = std::chrono::high_resolution_clock::now();
Quilts quilts_obj = Quilts(data,queries,page_size);
auto quilts_train_end = std::chrono::high_resolution_clock::now();
cout<<"Finished training QUILTS"<<endl;
//############ QUILTS ############
{ //range query
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["query_file"] = query_file;
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["build_time"] = chrono::duration_cast<chrono::seconds>(quilts_train_end - quilts_train_start).count();
quilts_json["config_id"] = string(argv[1]);
uint64_t result_size =0;
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: queries){
vector<Point> range_query_result = quilts_obj.RangeQuery(query);
result_size+=range_query_result.size();
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["range_result_size"]=result_size;
quilts_json["page_count"]=quilts_obj.page_cnt_;
quilts_json["node_count"]=quilts_obj.node_cnt_;
quilts_json["index_size"]=quilts_obj.ModelSize();
quilts_json["range_query_time"] = chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/queries.size();
quilts_json["range_query_scantime"] = quilts_obj.TimeSpentScanningPages()/queries.size();
quilts_json["range_query_page_accessed"]=quilts_obj.NumPagesAcessed()/queries.size();
quilts_json["range_query_points_scanned"]=quilts_obj.NumElementsScanned()/queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/Range.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
cout<<"Finished Range query QUILTS"<<endl;
cout<<"Starting Knn"<<endl;
    { // KNN queries
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
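        // For each k: clear the index's accumulated metrics, run the full KNN workload, then log per-query averages.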
for(auto &k : k_values){
quilts_json["k"] = k;
quilts_obj.ClearMetric();
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = quilts_obj.KNNQuery(query,k);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/knn_queries.size();
quilts_json["knn_query_scantime"] = quilts_obj.TimeSpentScanningPages()/knn_queries.size();
quilts_json["knn_query_page_accessed"]=quilts_obj.NumPagesAcessed()/knn_queries.size();
quilts_json["knn_query_points_scanned"]=quilts_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/KNN.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
}
{ // Point queries
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
bool point_query_result;
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = quilts_obj.PointQuery(query);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/result/Point.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
{ // Insert queries
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
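        // Same protocol as the other evaluators: five epochs, each inserting 10% of the data
        // and then timing a short range-query run over the first insert_rq_size queries.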
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
            for(uint32_t j=0;j<insert_increments;j++,ins_ix++){
quilts_obj.InsertElement(insert_queries[ins_ix]);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/insert_increments);
auto quilts_rq_eval_start = std::chrono::high_resolution_clock::now();
            for(uint32_t i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = quilts_obj.RangeQuery(queries[i]);
}
auto quilts_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(quilts_rq_eval_end - quilts_rq_eval_start).count()/insert_rq_size);
}
quilts_json["insert_query_time"]= insert_times;
quilts_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/result/Insert.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
cout<<"QUILTS DONE\n";
cout.flush();
return 0;
}
CC=g++ -O3 -std=c++14 # -D_GLIBCXX_USE_CXX11_ABI=0
SRCS=$(wildcard *.cpp */*.cpp)
OBJS=$(patsubst %.cpp, %.o, $(SRCS))
# Update the LibTorch install paths on the following lines
INCLUDE = -I/wrk-vakka/users/sachith/libtorch/include -I/wrk-vakka/users/sachith/libtorch/include/torch/csrc/api/include
LIB += -L/wrk-vakka/users/sachith/libtorch/lib -ltorch_cpu -lc10 -lpthread
FLAG = -Wl,-rpath=/wrk-vakka/users/sachith/libtorch/lib
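# -rpath embeds the LibTorch lib directory in the binary so it is found at run time without LD_LIBRARY_PATH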
NAME=$(wildcard *.cpp)
TARGET=$(patsubst %.cpp, %, $(NAME))
$(TARGET):$(OBJS)
$(CC) -o $@.out $^ $(INCLUDE) $(LIB) $(FLAG)
%.o:%.cpp
$(CC) -o $@ -c $< -g $(INCLUDE)
clean:
rm -rf $(TARGET) $(OBJS)
# RSMI
**NOTE: This code is based on original author's code repository:** https://github.com/Liuguanli/RSMI
Please refer to the original code for any clarifications, and credit the authors where appropriate.
## How to use
### 1. Required libraries
#### LibTorch
homepage: https://pytorch.org/get-started/locally/
#### boost
homepage: https://www.boost.org/
### 2. Change the Makefile
Change */wrk-vakka/users/sachith/libtorch* to your own path.
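A quick way to do this (the path below is a placeholder for your own LibTorch install):

```bash
# Rewrite every hard-coded LibTorch path in the Makefile (substitute your own path)
sed -i 's|/wrk-vakka/users/sachith/libtorch|/path/to/your/libtorch|g' Makefile
```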
### 3. Run
```bash
make clean
make
```
### Paper
> Jianzhong Qi, Guanli Liu, Christian S. Jensen, Lars Kulik: [Effectively Learning Spatial Indices](http://www.vldb.org/pvldb/vol13/p2341-qi.pdf). Proc. VLDB Endow. 13(11): 2341-2354 (2020)
```tex
@article{DBLP:journals/pvldb/QiLJK20,
author = {Jianzhong Qi and
Guanli Liu and
Christian S. Jensen and
Lars Kulik},
title = {Effectively Learning Spatial Indices},
journal = {{PVLDB}},
volume = {13},
number = {11},
pages = {2341--2354},
year = {2020},
url = {http://www.vldb.org/pvldb/vol13/p2341-qi.pdf},
}
```