Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ads/learned-zindex
1 result
Show changes
Commits on Source (2)
/**
* @file train_test_zindex.cpp
* @author Sachith (sachith.pai@helsinki.fi)
* @brief File to train and test the flood index
* @version 0.1
* @date 2022-05-04
*
* @copyright Copyright (c) 2022
*
*/
#include<iostream>
#include <vector>
#include <algorithm>
#include <time.h>
#include <random>
#include <chrono>
#include <fstream>
#include <stdlib.h>
#include<set>
#include<map>
#include<string>
#include"floodlite.h"
#include "../toml11-master/toml.hpp"
#include "../json.hpp"
using namespace std;
using json = nlohmann::json; // using this to dump various logs.
#define addeb if(0)
const string data_path = "../Datasets/";
int main(int argc, char* argv[])
{
const auto data_folder = string(argv[2]);
const auto experiment_name = string(argv[3]);
const auto config = toml::parse(data_path+data_folder+"/Experiments/"+experiment_name+"/config/"+string(argv[1])+".toml");
const auto point_class = toml::find<std::string>(config, "point_class");
const auto point_file = toml::find<std::string>(config, "point_file");
const auto query_file = toml::find<std::string>(config, "query_file");
cout.precision(17);
vector<Point> data;
double_t a, b, c, d;
ifstream pointsfile(data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file);
while ( pointsfile >> a >> b)
data.push_back(Point(a,b));
pointsfile.close();
cout<<data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file<<endl;
uint32_t insert_increments = uint32_t(data.size()*0.1);
cout<<"Finishe reading data "<<data.size()<<endl;
vector<pair<Point,Point>> queries;
ifstream queriesfile(data_path+data_folder+"/Queries/RangeQueries/"+query_file);
while (queriesfile >> a >> b >> c >> d)
queries.push_back(make_pair(Point(a,b),Point(c,d)));
queriesfile.close();
uint32_t insert_rq_size = uint32_t(queries.size()*0.05);
int page_size = toml::find<std::int32_t>(config, "page_size");
cout<<"Finishe reading data n queries "<<data.size()<<" , "<<queries.size()<<"\n";
cout.flush();
auto flood_train_start = std::chrono::high_resolution_clock::now();
FloodLite flood_obj = FloodLite(data,queries,page_size);
auto flood_train_end = std::chrono::high_resolution_clock::now();
cout<<"Finished training FLOOD"<<endl;
//############ FLOOD ############
{
json flood_json;
flood_json["model"]="FLOOD";
flood_json["query_file"] = query_file;
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["build_time"] = chrono::duration_cast<chrono::seconds>(flood_train_end - flood_train_start).count();
flood_json["config_id"] = string(argv[1]);
uint64_t result_size =0;
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: queries){
vector<Point> range_query_result = flood_obj.RangeQuery(query);
result_size+=range_query_result.size();
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["range_result_size"]=result_size;
flood_json["page_count"]=flood_obj.page_cnt_;
flood_json["node_count"]=0;
flood_json["index_size"]=flood_obj.ModelSize();
flood_json["range_query_time"] = chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/queries.size();
flood_json["range_query_scantime"] = flood_obj.TimeSpentScanningPages()/queries.size();
flood_json["range_query_page_accessed"]=flood_obj.NumPagesAcessed()/queries.size();
flood_json["range_query_points_scanned"]=flood_obj.NumElementsScanned()/queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Range.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
cout<<"argv[4] :"<<argv[4]<<endl;
if(atoi(argv[4])){
vector<Point> knn_queries;
ifstream knn_queriesfile(data_path+data_folder+"/Queries/KnnQueries/"+point_class);
while (knn_queriesfile >> a >> b)
knn_queries.push_back(Point(a,b));
knn_queriesfile.close();
vector<uint32_t> k_values = toml::find<std::vector<uint32_t>>(config, "knn_k_values");
vector<Point> point_queries;
ifstream point_queriesfile(data_path+data_folder+"/Queries/PointQueries/"+point_class);
while (point_queriesfile >> a >> b)
point_queries.push_back(Point(a,b));
point_queriesfile.close();
vector<Point> insert_queries;
ifstream insert_queriesfile(data_path+data_folder+"/Queries/InsertQueries/"+point_class);
while (insert_queriesfile >> a >> b)
insert_queries.push_back(Point(a,b));
insert_queriesfile.close();
{ // KNN querys
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
flood_obj.ClearMetric();
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
knn_query_result = flood_obj.KNNQuery(query,k);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["k"]=k;
flood_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/knn_queries.size();
flood_json["knn_query_scantime"] = flood_obj.TimeSpentScanningPages()/knn_queries.size();
flood_json["knn_query_page_accessed"]=flood_obj.NumPagesAcessed()/knn_queries.size();
flood_json["knn_query_points_scanned"]=flood_obj.NumElementsScanned()/knn_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/KNN.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
}
{ // Point queries
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
bool point_query_result;
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = flood_obj.PointQuery(query);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
flood_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Point.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
cout<<"Point and KNN query"<<endl;
{ // Insert queries
json flood_json;
flood_json["model"]="FLOOD";
flood_json["point_class"] = point_class;
flood_json["point_file"] = point_file;
flood_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto flood_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
// cout<<insert_queries[ins_ix].x_<<" "<<insert_queries[ins_ix].y_<<endl;
flood_obj.InsertElement(insert_queries[ins_ix]);
}
auto flood_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(flood_eval_end - flood_eval_start).count()/insert_increments);
// cout<<" ############## EPOCH: "<<ins_epoch<<" Done ############## "<<endl;
auto flood_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = flood_obj.RangeQuery(queries[i]);
}
auto flood_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(flood_rq_eval_end - flood_rq_eval_start).count()/insert_rq_size);
}
flood_json["insert_query_time"]= insert_times;
flood_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Insert.json",ios_base::app);
o << flood_json << std::endl;
o.close();
}
}
cout<<"FLOOD DONE\n";
cout.flush();
return 0;
}
\ No newline at end of file
/**
* @file flood.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief
* @version 0.1
* @date 2022-10-03
*
* @copyright Copyright (c) 2022
*
*/
#ifndef FLOODLITE_H
#define FLOODLITE_H
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <limits>
#include <list>
#include <random>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
const double_t eps=1e-12;
/**
 * @brief A 2-D point with public x/y coordinates.
 */
class Point{
public:
    double_t x_,y_;
    Point(double_t x, double_t y):x_(x),y_(y){}
    /**
     * @brief Exact coordinate-wise equality (no tolerance).
     *
     * Const-qualified so that const points and temporaries can be compared.
     */
    bool operator==(const Point &other_point) const {
        return x_ == other_point.x_ && y_ == other_point.y_;
    }
};
/**
 * @brief Orders points by ascending x-coordinate; `x_sort_order` is a ready-made instance.
 *
 * NOTE(review): `x_sort_order` is a namespace-scope object defined in this header, so
 * including the header from more than one translation unit would define it twice.
 */
class XComparatorPoint {
public:
    // Const-qualified: comparing never mutates the comparator, and const
    // instances must remain callable.
    bool operator()(const Point& a,const Point& b) const
    {
        return a.x_ < b.x_;
    }
}x_sort_order;
/**
 * @brief Orders points by ascending y-coordinate; `y_sort_order` is a ready-made instance.
 *
 * NOTE(review): `y_sort_order` is a namespace-scope object defined in this header, so
 * including the header from more than one translation unit would define it twice.
 */
class YComparatorPoint {
public:
    // Const-qualified: comparing never mutates the comparator, and const
    // instances must remain callable.
    bool operator()(const Point& a,const Point& b) const
    {
        return a.y_ < b.y_;
    }
}y_sort_order;
/*
* @brief A comparator class for sorting points according to distance for KNN queries.
*/
class DistanceComparator {
public:
Point orig;
DistanceComparator(const Point &point):orig(point){ }
bool operator()(const Point& a,const Point& b)
{
double_t distance_a = (orig.x_-a.x_)*(orig.x_-a.x_) + (orig.y_-a.y_)*(orig.y_-a.y_);
double_t distance_b = (orig.x_-b.x_)*(orig.x_-b.x_) + (orig.y_-b.y_)*(orig.y_-b.y_);
return (distance_a<distance_b);
}
double_t Distance(const Point& a)
{
return sqrt((orig.x_-a.x_)*(orig.x_-a.x_) + (orig.y_-a.y_)*(orig.y_-a.y_));
}
};
/**
 * @brief A leaf page: an axis-aligned bounding box (`low_` .. `high_`) and the
 *        points stored in it.
 */
class Page{
public:
    Point low_, high_;              // lower-left and upper-right corners of the box
    std::vector<Point> page_data_;  // points held by this page
    Page(double_t lx, double_t ly, double_t hx, double_t hy):low_(Point(lx,ly)),high_(Point(hx,hy)){}
    /// @return the area of the bounding box.
    double_t Area() const {
        return (high_.x_-low_.x_)*(high_.y_-low_.y_);
    }
    /**
     * @brief Smallest axis-aligned distance from `other` to any of the four box sides.
     *
     * std::abs is used explicitly: an unqualified `abs` can resolve to the C
     * `int abs(int)` overload, which would truncate these fractional distances to 0.
     */
    double_t DistanceToEdge(const Point& other) const
    {
        const double_t to_low = std::min(std::abs(other.x_-low_.x_),std::abs(other.y_-low_.y_));
        const double_t to_high = std::min(std::abs(other.x_-high_.x_),std::abs(other.y_-high_.y_));
        return std::min(to_low,to_high);
    }
};
/**
 * @brief Grid index over 2-D points: the data is cut into columns along one
 *        dimension and each column is cut into pages along the other dimension.
 */
class FloodLite
{
public:
bool grid_orientation_; // false = columns are cut along X (pages along Y); true = columns along Y (pages along X)
uint32_t num_grid_splits_; // number of columns along the first dimension
uint32_t page_size_,num_datapoints_; // page_size_: a page is split on insert once it holds more points than this; num_datapoints_: data size recorded by the training constructor
std::vector<double_t> grid_split_; // per column: its upper bound along the first dimension plus eps, as pushed by BuildFlood
std::vector<std::vector<double_t>> page_split_; // per column, per page: coordinate (second dimension) of the page's last point, as pushed by BuildFlood
std::vector<std::vector<std::list<Page>::iterator>> page_iters_; // per column, per page: iterator to the page inside page_array_
// Storage for all pages. A std::list is used here; InsertElement inserts new
// pages while page_iters_ keeps iterators to the existing ones.
// (Earlier layout, kept for reference: std::vector<std::vector<Point>> page_array_;)
std::list<Page> page_array_;
// *METRICS* (accumulated by RangeQuery, reset by ClearMetric)
uint64_t metric_time_spent_scanning_pages_{}; // total nanoseconds spent in the page-scan phase
uint64_t metric_num_pages_accessed_{}; // total number of pages visited
uint64_t metric_num_elements_scanned_{}; // total number of points compared against a query window
uint32_t page_cnt_{}; // number of pages created by the last BuildFlood
/**
 * @brief Trains Flood by building up to 30 randomly chosen candidate configurations
 *        (grid_orientation_, num_grid_splits_), timing a 0.5% sample of the range
 *        queries on each, and keeping the fastest one.
 *
 * Side effects on the arguments: `queries` is shuffled and `datapoints` is re-sorted
 * by BuildFlood.
 *
 * @param datapoints points to index.
 * @param queries    range queries as (low corner, high corner) pairs.
 * @param page_size  page capacity.
 */
FloodLite(std::vector<Point> &datapoints, std::vector<std::pair<Point,Point>> &queries,uint32_t page_size):page_size_(page_size){
    // std::random_shuffle was removed in C++17. A default-seeded engine keeps
    // the shuffles (and therefore the trained layout) reproducible between runs.
    std::mt19937 shuffle_rng;
    std::shuffle(queries.begin(),queries.end(),shuffle_rng);
    const uint32_t M = queries.size();
    num_datapoints_ = datapoints.size();

    // Candidate configurations are derived from the query extents.
    std::vector<std::pair<bool,uint32_t>> candidate_configs = ConfigurationsFromQueries(queries);
    std::shuffle(candidate_configs.begin(),candidate_configs.end(),shuffle_rng);
    std::cout<<" There are "<<candidate_configs.size()<<" candidate configs."<<std::endl;

    const std::vector<std::pair<Point,Point>> query_subsample(queries.begin(),queries.begin()+uint32_t(M*0.005));

    uint64_t best_sample_query_time = std::numeric_limits<uint64_t>::max();
    // Fallback used when there is no candidate to evaluate: a single column cut
    // along X. Without it the index would be built from uninitialised values.
    bool best_orientation = false;
    uint32_t best_num_grid_split = 1;

    uint32_t num_configs_to_try = 30;
    for(const std::pair<bool,uint32_t> &conf: candidate_configs){
        if(num_configs_to_try==0)
            break;
        --num_configs_to_try;
        grid_orientation_ = conf.first;
        num_grid_splits_ = conf.second;
        BuildFlood(datapoints);
        const auto query_start = std::chrono::high_resolution_clock::now();
        for(const std::pair<Point,Point> &q: query_subsample)
            RangeQuery(q);
        const auto query_end = std::chrono::high_resolution_clock::now();
        const uint64_t query_time = std::chrono::duration_cast<std::chrono::nanoseconds>(query_end-query_start).count();
        if(query_time<best_sample_query_time){
            best_num_grid_split = conf.second;
            best_orientation = conf.first;
            best_sample_query_time = query_time;
        }
    }

    // Rebuild with the winning configuration.
    grid_orientation_ = best_orientation;
    num_grid_splits_ = best_num_grid_split;
    std::cout<<"grid_orientation_:"<<grid_orientation_<<" num_grid_splits_:"<<num_grid_splits_<<std::endl;
    BuildFlood(datapoints);
    // Resetting metrics to wipe out values accumulated during training.
    metric_num_pages_accessed_=0;
    metric_num_elements_scanned_=0;
    metric_time_spent_scanning_pages_=0;
}
/**
* @brief Construct a new Flood Lite object by loading the config from a saved file.
*/
FloodLite(std::vector<Point> &datapoints,uint32_t page_size,std::string filename):page_size_(page_size){
std::ifstream fin(filename);
fin>>grid_orientation_>>num_grid_splits_;
BuildFlood(datapoints);
}
/**
 * @brief Function to extract 'orientation + num_split' candidates to select from.
 *
 * For every query the number of query-sized slices that fit into a unit-length
 * axis, ceil(1 / extent), is computed per dimension. Every count between the
 * smallest and largest value seen is emitted, with orientation `false` for the
 * X-derived counts and `true` for the Y-derived counts.
 *
 * @param queries range queries as (low corner, high corner) pairs.
 * @return candidate (grid_orientation, num_grid_splits) pairs; empty when no
 *         query has a positive extent.
 */
std::vector<std::pair<bool,uint32_t>> ConfigurationsFromQueries(std::vector<std::pair<Point,Point>> &queries){
    // Largest count ever produced. Kept one below the type's maximum so that
    // the `<=` loops at the end cannot wrap around and run forever.
    const uint32_t max_splits = std::numeric_limits<uint32_t>::max()-1;
    uint32_t split_count_lower_x=std::numeric_limits<uint32_t>::max(),split_count_higher_x=0;
    uint32_t split_count_lower_y=std::numeric_limits<uint32_t>::max(),split_count_higher_y=0;
    for(const auto &q : queries){
        const double_t x_range = q.second.x_ - q.first.x_;
        const double_t y_range = q.second.y_ - q.first.y_;
        // A zero or negative extent carries no information, and 1/extent would
        // not be convertible to uint32_t, so such a dimension is skipped.
        if(x_range > 0.0){
            const uint32_t splits_per_range_x = uint32_t(std::min<double_t>(std::ceil(1.0/x_range),max_splits));
            split_count_higher_x = std::max(split_count_higher_x,splits_per_range_x);
            split_count_lower_x = std::min(split_count_lower_x,splits_per_range_x);
        }
        if(y_range > 0.0){
            // The Y bounds are driven by the Y extent.
            const uint32_t splits_per_range_y = uint32_t(std::min<double_t>(std::ceil(1.0/y_range),max_splits));
            split_count_higher_y = std::max(split_count_higher_y,splits_per_range_y);
            split_count_lower_y = std::min(split_count_lower_y,splits_per_range_y);
        }
    }
    std::vector<std::pair<bool,uint32_t>> configs;
    for(uint32_t spl=split_count_lower_x;spl<=split_count_higher_x;spl++)
        configs.push_back(std::make_pair(false,spl));
    for(uint32_t spl=split_count_lower_y;spl<=split_count_higher_y;spl++)
        configs.push_back(std::make_pair(true,spl));
    return configs;
}
/**
* @brief Function to build a Flood model based on configuration (grid_orientiation_, num_grid_splits_)
*/
void BuildFlood(std::vector<Point> &datapoints){
//clear the any existing structure.
grid_split_.clear();
page_split_.clear();
page_iters_.clear();
page_array_.clear();
page_cnt_ =0;
uint32_t effective_page_size = uint32_t(0.9 *page_size_);
if(grid_orientation_)
std::sort(datapoints.begin(),datapoints.end(),y_sort_order);
else
std::sort(datapoints.begin(),datapoints.end(),x_sort_order);
//creating the vector of vectors
page_split_.resize(num_grid_splits_);
page_iters_.resize(num_grid_splits_);
uint32_t max_num_elements_per_column = std::ceil(datapoints.size()*1.0/num_grid_splits_);
uint32_t col_start = 0;
// loop to generate all column data.
for(size_t col_id = 0;col_id < num_grid_splits_;col_id++){
uint32_t num_elements_in_column = std::ceil((datapoints.size()*1.0-col_start)/(num_grid_splits_-col_id));
uint32_t col_end_pt_id= std::min(col_start+num_elements_in_column,uint32_t(datapoints.size()));
if(grid_orientation_){
//sorting the data points within column along second dimension.
double_t page_y_low = datapoints[col_start].y_, page_y_high = datapoints[col_end_pt_id-1].y_;
if(col_id==0)
page_y_low=0.0;
else if(col_id==num_grid_splits_-1)
page_y_high = 1.0;
//storing column upper bound
grid_split_.push_back(page_y_high+eps);
std::sort(datapoints.begin()+col_start,datapoints.begin()+col_end_pt_id,x_sort_order);
// loop for each page.
for(uint32_t page_start=col_start;page_start<col_end_pt_id;page_start+=effective_page_size){
uint32_t page_end_element_id = std::min(std::min(page_start+effective_page_size,col_end_pt_id),uint32_t(datapoints.size()));
assert(datapoints[page_start].x_ <= datapoints[page_end_element_id-1].x_);
assert(page_y_low <= page_y_high);
page_array_.push_back(Page(datapoints[page_start].x_,page_y_low,datapoints[page_end_element_id-1].x_,page_y_high));
std::list<Page>::iterator curr_page_iter = std::prev(page_array_.end());
(*curr_page_iter).page_data_.assign(datapoints.begin()+page_start,datapoints.begin()+page_end_element_id);
page_split_[col_id].push_back(datapoints[page_end_element_id-1].x_);
page_iters_[col_id].push_back(curr_page_iter);
page_cnt_++;
}
}
else{
//sorting the data points within column along second dimension.
double_t page_x_low = datapoints[col_start].x_, page_x_high = datapoints[col_end_pt_id-1].x_;
if(col_id==0)
page_x_low=0.0;
else if(col_id==num_grid_splits_-1)
page_x_high = 1.0;
//storing column upper bound
grid_split_.push_back(page_x_high+eps);
std::sort(datapoints.begin()+col_start,datapoints.begin()+col_end_pt_id,y_sort_order);
// loop for each page.
for(uint32_t page_start=col_start;page_start<col_end_pt_id;page_start+=effective_page_size){
uint32_t page_end_element_id = std::min(std::min(page_start+effective_page_size,col_end_pt_id),uint32_t(datapoints.size()));
assert(page_x_low <= page_x_high);
assert(datapoints[page_start].y_ <= datapoints[page_end_element_id-1].y_);
page_array_.push_back(Page(page_x_low,datapoints[page_start].y_,page_x_high,datapoints[page_end_element_id-1].y_));
std::list<Page>::iterator curr_page_iter = std::prev(page_array_.end());
(*curr_page_iter).page_data_.assign(datapoints.begin()+page_start,datapoints.begin()+page_end_element_id);
page_split_[col_id].push_back(datapoints[page_end_element_id-1].y_);
page_iters_[col_id].push_back(curr_page_iter);
page_cnt_++;
}
}
col_start+=num_elements_in_column;
}
}
/**
 * @brief Writes the grid configuration to a text file: the orientation flag,
 *        one space, then the number of columns.
 *
 * @param file path of the file to (over)write.
 */
void SaveFlood(std::string file){
    std::ofstream config_out(file);
    config_out<<grid_orientation_;
    config_out<<" ";
    config_out<<num_grid_splits_;
    config_out.close();
}
/**
* @brief The range query algorithm which
* - Finds the relevant columns
* - Refines the relevant pages within each of those columns
* - Scans and filters points from these pages.
*/
std::vector<Point> RangeQuery(std::pair<Point,Point> query){
std::vector<Point> query_result;
//using oriented_query, where x_ & y_ values are swapped based on grid orientation.
std::pair<Point,Point> oriented_query = query;
if(grid_orientation_){
std::swap(oriented_query.first.x_,oriented_query.first.y_);
std::swap(oriented_query.second.x_,oriented_query.second.y_);
}
std::vector<double_t>::iterator first_col=std::upper_bound(grid_split_.begin(),grid_split_.end(),(oriented_query.first.x_));
std::vector<double_t>::iterator last_col=std::upper_bound(grid_split_.begin(),grid_split_.end(),(oriented_query.second.x_));
uint32_t col_id_st = first_col - grid_split_.begin();
uint32_t col_id_last = last_col - grid_split_.begin();
col_id_last = (col_id_last>=num_grid_splits_)?num_grid_splits_-1:col_id_last;
for(uint32_t col_ix = col_id_st;col_ix<=col_id_last;col_ix++){
std::vector<double_t>::iterator first_page=std::upper_bound(page_split_[col_ix].begin(),page_split_[col_ix].end(),oriented_query.first.y_);
if (first_page == page_split_[col_ix].end())
continue;
uint32_t first_page_id = std::distance(page_split_[col_ix].begin(),first_page); // id in terms of within the page_split_ internal vector array.
std::vector<double_t>::iterator last_page=std::upper_bound(page_split_[col_ix].begin(),page_split_[col_ix].end(),oriented_query.second.y_);
uint32_t last_page_id = (last_page == page_split_[col_ix].end())?page_split_[col_ix].size()-1:std::distance(page_split_[col_ix].begin(),last_page); // if it returns an end iterator reset it to the last element.
//scanning each page that overlaps with query region.
auto page_scan_phase_start = std::chrono::high_resolution_clock::now();
metric_num_pages_accessed_ += last_page_id - first_page_id +1;
for(uint32_t page_ix = first_page_id;page_ix<=last_page_id;page_ix++){
std::list<Page>::iterator curr_page_iter = page_iters_[col_ix][page_ix];
metric_num_elements_scanned_ += (*curr_page_iter).page_data_.size();
for(Point &p: (*curr_page_iter).page_data_)
if(p.x_ >= query.first.x_ && p.x_<query.second.x_ && p.y_ >= query.first.y_ && p.y_<query.second.y_)
query_result.push_back(p);
}
auto page_scan_phase_end = std::chrono::high_resolution_clock::now();
metric_time_spent_scanning_pages_+=std::chrono::duration_cast<std::chrono::nanoseconds>(page_scan_phase_end-page_scan_phase_start).count();
}
return query_result;
}
/**
 * @brief Locates the (column index, page index) responsible for a point.
 *
 * Coordinates beyond the last column or page bound are mapped to the last
 * column or page respectively.
 *
 * @pre the index has been built and holds at least one page per column.
 * @param pnt point to locate.
 * @return pair of (index into page_split_/page_iters_, index within that column).
 */
std::pair<uint32_t,uint32_t> ColIdPageIdForPoint(Point pnt){
    // Swap so that x_ always addresses columns and y_ always addresses pages.
    Point oriented_p = pnt;
    if(grid_orientation_)
        std::swap(oriented_p.x_,oriented_p.y_);

    assert(!page_split_.empty());
    // First column whose (eps-padded) upper bound exceeds the coordinate.
    uint32_t col_ix = std::upper_bound(grid_split_.begin(),grid_split_.end(),oriented_p.x_) - grid_split_.begin();
    // A coordinate beyond the last bound would index past the last column.
    if(col_ix >= page_split_.size())
        col_ix = page_split_.size()-1;

    const std::vector<double_t> &splits = page_split_[col_ix];
    assert(!splits.empty());
    // lower_bound: page bounds are the coordinate of the page's largest point, so
    // a point equal to a bound belongs to that page, not to the following one.
    std::vector<double_t>::const_iterator page_iter = std::lower_bound(splits.begin(),splits.end(),oriented_p.y_);
    if (page_iter == splits.end())
        page_iter = std::prev(page_iter);
    const uint32_t page_ix = page_iter - splits.begin();
    return std::make_pair(col_ix,page_ix);
}
/**
 * @brief Exact-match lookup of a point.
 *
 * @param pnt point to look for.
 * @return true when a stored point has exactly the same coordinates.
 */
bool PointQuery(Point pnt){
    const uint32_t col_ix = ColIdPageIdForPoint(pnt).first;
    const std::vector<double_t> &splits = page_split_[col_ix];
    if(splits.empty())
        return false;
    // Coordinate along which the pages of a column are ordered.
    const double_t page_key = grid_orientation_ ? pnt.x_ : pnt.y_;
    // Page bounds are the coordinate of each page's largest point. The point can
    // therefore sit in any page from the first bound >= key (it may be that page's
    // largest point) up to the first bound > key (equal coordinates may straddle
    // two pages). Past-the-end results fall back to the last page.
    size_t first_page_id = std::lower_bound(splits.begin(),splits.end(),page_key) - splits.begin();
    size_t last_page_id = std::upper_bound(splits.begin(),splits.end(),page_key) - splits.begin();
    first_page_id = std::min(first_page_id,splits.size()-1);
    last_page_id = std::min(last_page_id,splits.size()-1);
    for(size_t page_ix = first_page_id;page_ix<=last_page_id;page_ix++)
        for(const Point &p: page_iters_[col_ix][page_ix]->page_data_)
            if(pnt.x_ == p.x_ and pnt.y_ == p.y_)
                return true;
    return false;
}
/**
 * @brief k-nearest-neighbour query built on top of RangeQuery.
 *
 * The page containing the query point is inspected first; if its k-th nearest
 * point is closer than the nearest page edge the answer is taken from that
 * page alone. Otherwise a square window around the query point is searched
 * and enlarged until it provably contains the k nearest points. The window is
 * clipped to the unit square [0,1] x [0,1].
 *
 * @param query_point centre of the search.
 * @param k           number of neighbours requested.
 * @return up to k points ordered by increasing distance; fewer than k only when
 *         the whole unit square has been searched and holds fewer matches.
 */
std::vector<Point> KNNQuery(Point query_point,uint32_t k){
    std::vector<Point> queryResult;
    // Nothing requested; also keeps the [k-1] accesses below in range.
    if(k==0)
        return queryResult;

    const std::pair<uint32_t,uint32_t> col_page_id = ColIdPageIdForPoint(query_point);
    Page &home_page = *page_iters_[col_page_id.first][col_page_id.second];
    const double_t pi = 2 * std::acos(0.0);
    DistanceComparator distance_sorter = DistanceComparator(query_point);

    std::vector<Point> range_query_result(home_page.page_data_.begin(),home_page.page_data_.end());
    std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);

    double_t dist;
    if(range_query_result.size()>=k){
        dist = distance_sorter.Distance(range_query_result[k-1]);
        // The k-th neighbour is closer than every page edge: no other page can hold a closer point.
        if (dist < home_page.DistanceToEdge(query_point)){
            queryResult.assign(range_query_result.begin(),range_query_result.begin()+k);
            return queryResult;
        }
        dist*=2;
    }
    else{
        // Estimate the radius expected to hold k points from the local density of the page.
        const double_t density = (home_page.page_data_.size()*1.0/num_datapoints_)/home_page.Area();
        dist = std::sqrt((k*4.0)/(num_datapoints_*density*pi));
    }
    // A zero-area or empty page yields a zero, infinite or NaN radius, which the
    // doubling below could never repair. Fall back to a uniform-density estimate.
    if(!std::isfinite(dist) || dist<=0.0)
        dist = std::sqrt((k*4.0)/(std::max<uint32_t>(num_datapoints_,1)*pi));

    do{
        const Point window_low(std::max(0.0,query_point.x_-dist), std::max(0.0,query_point.y_-dist));
        const Point window_high(std::min(1.0,query_point.x_+dist), std::min(1.0,query_point.y_+dist));
        const bool covers_domain = window_low.x_<=0.0 && window_low.y_<=0.0 && window_high.x_>=1.0 && window_high.y_>=1.0;
        range_query_result = RangeQuery(std::make_pair(window_low,window_high));
        if(range_query_result.size()<k){
            if(covers_domain){
                // The whole unit square was searched: enlarging further cannot find
                // more points, so return everything found instead of looping forever.
                std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);
                return range_query_result;
            }
            dist = dist*2;
            continue;
        }
        std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);
        const double_t kth_distance = distance_sorter.Distance(range_query_result[k-1]);
        // Only accept when the k-th neighbour lies inside the circle inscribed in
        // the window; otherwise a closer point may sit outside the window.
        if(kth_distance<dist)
            queryResult.assign(range_query_result.begin(),range_query_result.begin()+k);
        else
            dist = kth_distance*2;
    }while(queryResult.size()<k);
    return queryResult;
}
/**
 * @brief Inserts a point into the page responsible for it and splits that page
 *        in two when it grows beyond page_size_.
 *
 * The page's bounding box and its entry in page_split_ are grown to cover the
 * new point. On overflow the page's points are sorted along the in-column
 * dimension; the lower half stays in the page and the upper half moves to a
 * new page inserted directly after it.
 *
 * @param new_point point to add.
 */
void InsertElement(Point new_point){
    /* Find the page where the new point is to be inserted. */
    const std::pair<uint32_t,uint32_t> col_page_id = ColIdPageIdForPoint(new_point);
    const uint32_t col_ix = col_page_id.first, page_ix = col_page_id.second;
    const std::list<Page>::iterator page_iter_of_p = page_iters_[col_ix][page_ix];
    Page &page = *page_iter_of_p;

    page.page_data_.push_back(new_point);
    num_datapoints_++; // keep the point count in step with the stored data

    // Grow the bounding box so that it covers the new point.
    page.low_.x_ = std::min(page.low_.x_,new_point.x_);
    page.low_.y_ = std::min(page.low_.y_,new_point.y_);
    page.high_.x_ = std::max(page.high_.x_,new_point.x_);
    page.high_.y_ = std::max(page.high_.y_,new_point.y_);
    // The page's upper bound along the in-column dimension may have grown.
    page_split_[col_ix][page_ix] = grid_orientation_ ? page.high_.x_ : page.high_.y_;

    if(page.page_data_.size() <= page_size_)
        return;

    std::vector<Point> curr_page_data(page.page_data_.begin(),page.page_data_.end());
    const size_t curr_page_size = curr_page_data.size();
    // A single point cannot be divided between two pages.
    if(curr_page_size < 2)
        return;
    const size_t lower_half_size = (curr_page_size+1)/2;

    const std::list<Page>::iterator next_page_iter = std::next(page_iter_of_p);
    std::list<Page>::iterator new_page_iter;
    // Since we are now splitting inside the specific column, we have opposite
    // sort orders to the ones used for cutting columns.
    if(grid_orientation_){
        std::sort(curr_page_data.begin(),curr_page_data.end(),x_sort_order);
        new_page_iter = page_array_.insert(next_page_iter,Page(curr_page_data[lower_half_size].x_,page.low_.y_,curr_page_data[curr_page_size-1].x_,page.high_.y_));
        page.high_.x_ = curr_page_data[lower_half_size-1].x_;
        // page_split_ holds each page's UPPER bound and must stay sorted for the
        // binary searches: the shrunken page gets its new bound and the new page
        // is registered with its own upper bound.
        page_split_[col_ix][page_ix] = page.high_.x_;
        page_split_[col_ix].insert(page_split_[col_ix].begin()+page_ix+1,new_page_iter->high_.x_);
    }
    else{
        std::sort(curr_page_data.begin(),curr_page_data.end(),y_sort_order);
        new_page_iter = page_array_.insert(next_page_iter,Page(page.low_.x_,curr_page_data[lower_half_size].y_,page.high_.x_,curr_page_data[curr_page_size-1].y_));
        page.high_.y_ = curr_page_data[lower_half_size-1].y_;
        page_split_[col_ix][page_ix] = page.high_.y_;
        page_split_[col_ix].insert(page_split_[col_ix].begin()+page_ix+1,new_page_iter->high_.y_);
    }
    // Inserting iterator to the new page into page_iters_[col_ix].
    page_iters_[col_ix].insert(page_iters_[col_ix].begin()+page_ix+1,new_page_iter);
    // Assigning appropriate data to the two pages.
    page.page_data_.assign(curr_page_data.begin(),curr_page_data.begin()+lower_half_size);
    new_page_iter->page_data_.assign(curr_page_data.begin()+lower_half_size,curr_page_data.end());
    page_cnt_++; // one more page exists now
}
// *********************** Extract Metrics ******************
/**
 * @brief Time accumulated in the page-scan phase of RangeQuery since the last
 *        metric reset, in nanoseconds, converted to floating point.
 */
double_t TimeSpentScanningPages(){
    const double_t scan_time_ns = metric_time_spent_scanning_pages_;
    return scan_time_ns;
}
/**
 * @brief Size estimate of the index in bytes: one double per column bound plus
 *        one (double, uint32_t) pair per page. The stored points are not counted.
 *
 * TODO: still marked as unfinished by the original author.
 */
size_t ModelSize(){
    const size_t column_bytes = sizeof(double_t)*grid_split_.size();
    const size_t page_bytes = page_array_.size()*sizeof(std::pair<double_t,uint32_t>);
    return column_bytes + page_bytes;
}
/**
 * @brief Number of points compared against a query window since the last metric reset.
 */
uint64_t NumElementsScanned(){
    const uint64_t scanned_points = metric_num_elements_scanned_;
    return scanned_points;
}
/**
 * @brief Number of pages visited by range queries since the last metric reset.
 */
uint64_t NumPagesAcessed(){
    const uint64_t visited_pages = metric_num_pages_accessed_;
    return visited_pages;
}
/**
 * @brief Resets the three scan metrics (pages accessed, elements scanned,
 *        nanoseconds spent scanning) to zero.
 */
void ClearMetric(){
    metric_num_elements_scanned_ = 0;
    metric_num_pages_accessed_ = 0;
    metric_time_spent_scanning_pages_ = 0; // accumulated nanoseconds
}
};
#endif
\ No newline at end of file
/**
* @file train_test_zindex.cpp
* @author Sachith (sachith.pai@helsinki.fi)
* @brief File to train and test the zindex.
* @version 0.1
* @date 2022-05-04
*
* @copyright Copyright (c) 2022
*
*/
#include<iostream>
#include <vector>
#include <algorithm>
#include <time.h>
#include <random>
#include <chrono>
#include <fstream>
#include <stdlib.h>
#include<set>
#include<map>
#include<string>
#include"quilts.h"
#include "../toml11-master/toml.hpp"
#include "../json.hpp"
using namespace std;
using json = nlohmann::json; // using this to dump various logs.
const string data_path = "../Datasets/";
// Record layout: ((x, y), (x_rank, y_rank)).
// Names are std::-qualified so the macro does not depend on a using-directive
// being in effect where it is expanded.
#define pddii std::pair<std::pair<double_t,double_t>,std::pair<uint32_t,uint32_t>>
/// Orders records by their raw x value.
bool sortbyfirst(const pddii &a, const pddii &b){return (a.first.first < b.first.first);}
/// Orders records by their raw y value.
bool sortbysecond(const pddii &a, const pddii &b){return (a.first.second < b.first.second);}
/**
 * @brief Rank-space projection: stores, for every record, the position of its
 *        x value and of its y value in the respective sorted order.
 *
 * On return the records are ordered by y. Records with equal values receive
 * distinct consecutive ranks in whatever order the sort leaves them.
 *
 * @param arr records to rank in place; the raw (x, y) values are not changed.
 */
void rankspaceprojection(std::vector<pddii> &arr){
    // The index has the same unsigned type as the rank fields it is stored in,
    // which also avoids comparing a signed int against size().
    std::sort(arr.begin(), arr.end(), sortbyfirst);
    for(uint32_t rank=0;rank<arr.size();rank++)
        arr[rank].second.first = rank;
    std::sort(arr.begin(), arr.end(), sortbysecond);
    for(uint32_t rank=0;rank<arr.size();rank++)
        arr[rank].second.second = rank;
}
int main(int argc, char* argv[])
{
const auto data_folder = string(argv[2]);
const auto experiment_name = string(argv[3]);
const auto config = toml::parse(data_path+data_folder+"/Experiments/"+experiment_name+"/config/"+string(argv[1])+".toml");
const auto point_class = toml::find<std::string>(config, "point_class");
const auto point_file = toml::find<std::string>(config, "point_file");
const auto query_file = toml::find<std::string>(config, "query_file");
cout.precision(17);
vector<pddii> data_raw;
vector<double_t> x_values;
vector<double_t> y_values;
double_t a, b, c, d;
ifstream pointsfile(data_path+data_folder+"/DataPoints/"+point_class+"/"+point_file);
while ( pointsfile >> a >> b){
data_raw.push_back(make_pair(make_pair(a,b),make_pair(0,0)));
x_values.push_back(a);
y_values.push_back(b);
}
pointsfile.close();
rankspaceprojection(data_raw);
sort(x_values.begin(), x_values.end());
sort(y_values.begin(), y_values.end());
vector<Point> data;
for(int i=0;i<data_raw.size();i++){
data.push_back(Point(data_raw[i].second.first,data_raw[i].second.second));
}
uint32_t insert_increments = uint32_t(data.size()*0.1);
cout<<"Finishe reading data "<<data.size()<<endl;
vector<pair<Point,Point>> queries;
ifstream queriesfile(data_path+data_folder+"/Queries/RangeQueries/"+query_file);
while (queriesfile >> a >> b >> c >> d){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
uint32_t rank_c = (lower_bound(x_values.begin(),x_values.end(),c)-x_values.begin());
uint32_t rank_d = (lower_bound(y_values.begin(),y_values.end(),d)-y_values.begin());
queries.push_back(make_pair(Point(rank_a,rank_b),Point(rank_c,rank_d)));
}
uint32_t insert_rq_size = uint32_t(queries.size()*0.05);
queriesfile.close();
int page_size = toml::find<std::int32_t>(config, "page_size");
cout<<"Finishe reading data n queries "<<data.size()<<" , "<<queries.size()<<"\n";
cout.flush();
auto quilts_train_start = std::chrono::high_resolution_clock::now();
Quilts quilts_obj = Quilts(data,queries,page_size);
auto quilts_train_end = std::chrono::high_resolution_clock::now();
cout<<"Finished training QUILTS"<<endl;
//############ QUILTS ############
{ //range query
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["query_file"] = query_file;
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["build_time"] = chrono::duration_cast<chrono::seconds>(quilts_train_end - quilts_train_start).count();
quilts_json["config_id"] = string(argv[1]);
uint64_t result_size =0;
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: queries){
vector<Point> range_query_result = quilts_obj.RangeQuery(query);
result_size+=range_query_result.size();
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["range_result_size"]=result_size;
quilts_json["page_count"]=quilts_obj.page_cnt_;
quilts_json["node_count"]=quilts_obj.node_cnt_;
quilts_json["index_size"]=quilts_obj.ModelSize();
quilts_json["range_query_time"] = chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/queries.size();
quilts_json["range_query_scantime"] = quilts_obj.TimeSpentScanningPages()/queries.size();
quilts_json["range_query_page_accessed"]=quilts_obj.NumPagesAcessed()/queries.size();
quilts_json["range_query_points_scanned"]=quilts_obj.NumElementsScanned()/queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Range.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
cout<<"Finished Range query QUILTS"<<endl;
if(atoi(argv[4])){
vector<Point> knn_queries;
ifstream knn_queriesfile(data_path+data_folder+"/Queries/KnnQueries/"+point_class);
while (knn_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
knn_queries.push_back(Point(rank_a,rank_b));
}
knn_queriesfile.close();
vector<uint32_t> k_values = toml::find<std::vector<uint32_t>>(config, "knn_k_values");
cout<<"Finished Reading KNN QUILTS"<<endl;
vector<Point> point_queries;
ifstream point_queriesfile(data_path+data_folder+"/Queries/PointQueries/"+point_class);
while (point_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
point_queries.push_back(Point(rank_a,rank_b));
}
point_queriesfile.close();
cout<<"Finished reading Point QUILTS"<<endl;
vector<Point> insert_queries;
ifstream insert_queriesfile(data_path+data_folder+"/Queries/InsertQueries/"+point_class);
while (insert_queriesfile >> a >> b){
uint32_t rank_a = (lower_bound(x_values.begin(),x_values.end(),a)-x_values.begin());
uint32_t rank_b = (lower_bound(y_values.begin(),y_values.end(),b)-y_values.begin());
insert_queries.push_back(Point(rank_a,rank_b));
}
insert_queriesfile.close();
cout<<"Finished reading insert QUILTS"<<endl;
cout<<"Starting Knn"<<endl;
{ // KNN querys
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
vector<Point> knn_query_result;
for(auto &k : k_values){
quilts_json["k"] = k;
quilts_obj.ClearMetric();
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: knn_queries){
// cout<<"KNN query "<<query.x_<<" "<<query.y_<<endl;
knn_query_result = quilts_obj.KNNQuery(query,k);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["knn_query_time"]=chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/knn_queries.size();
quilts_json["knn_query_scantime"] = quilts_obj.TimeSpentScanningPages()/knn_queries.size();
quilts_json["knn_query_page_accessed"]=quilts_obj.NumPagesAcessed()/knn_queries.size();
quilts_json["knn_query_points_scanned"]=quilts_obj.NumElementsScanned()/knn_queries.size();
std::cout<<"QUILTS knn k:"<<k<<std::endl;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/KNN.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
}
cout<<"KNN DOne"<<endl;
{ // Point queries
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
bool point_query_result;
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(auto &query: point_queries){
point_query_result = quilts_obj.PointQuery(query);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
quilts_json["point_query_time"]=chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/point_queries.size();
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Point.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
cout<<"Point Queries DOne"<<endl;
{ // Insert queries
json quilts_json;
quilts_json["model"]="QUILTS";
quilts_json["point_class"] = point_class;
quilts_json["point_file"] = point_file;
quilts_json["config_id"] = string(argv[1]);
vector<uint64_t> insert_times;
vector<uint64_t> range_query_times;
uint32_t ins_ix=0;
for(int ins_epoch =0;ins_epoch<5;ins_epoch++){
auto quilts_eval_start = std::chrono::high_resolution_clock::now();
for(int j=0;j<insert_increments;j++,ins_ix++){
quilts_obj.InsertElement(insert_queries[ins_ix]);
}
auto quilts_eval_end = std::chrono::high_resolution_clock::now();
insert_times.push_back(chrono::duration_cast<chrono::nanoseconds>(quilts_eval_end - quilts_eval_start).count()/insert_increments);
auto quilts_rq_eval_start = std::chrono::high_resolution_clock::now();
for(int i=0;i<insert_rq_size;i++){
vector<Point> range_query_result = quilts_obj.RangeQuery(queries[i]);
}
auto quilts_rq_eval_end = std::chrono::high_resolution_clock::now();
range_query_times.push_back(chrono::duration_cast<chrono::nanoseconds>(quilts_rq_eval_end - quilts_rq_eval_start).count()/insert_rq_size);
}
quilts_json["insert_query_time"]= insert_times;
quilts_json["range_query_times"]= range_query_times;
ofstream o(data_path+data_folder+"/Experiments/"+experiment_name+"/result/Insert.json",ios_base::app);
o << quilts_json << std::endl;
o.close();
}
cout<<"Insert DOne"<<endl;
}
cout<<"QUILTS DONE\n";
cout.flush();
return 0;
}
\ No newline at end of file
/**
* @file quilts.h
* @author Sachith (sachith.pai@helsinki.fi)
* @brief Implementation of QUILTS, which uses a UB-tree with a custom
* bit-interleaving method.
* @version 0.1
* @date 2022-09-26
*
* @copyright Copyright (c) 2022
*
*/
#ifndef QUILTS_H
#define QUILTS_H
#include<algorithm>
#include <cassert>
#include<chrono>
#include<cmath>
#include<cstdint>
#include<iostream>
#include<limits>
#include<list>
#include<string>
#include<tuple>
#include<utility>
#include<vector>
/**
 * @brief A two-dimensional point with unsigned 32-bit coordinates.
 */
class Point{
public:
    uint32_t x_; // first coordinate
    uint32_t y_; // second coordinate

    /**
     * @brief Builds a point from its two coordinates.
     * @param x value stored in x_
     * @param y value stored in y_
     */
    Point(uint32_t x, uint32_t y)
        : x_(x),
          y_(y)
    {
    }
};
/**
 * @brief A comparator class for sorting points according to their Euclidean
 * distance from a fixed origin point; used by the KNN queries.
 *
 * All arithmetic is carried out in floating point. Subtracting and squaring the
 * unsigned 32-bit coordinates directly would wrap around as soon as a coordinate
 * difference reaches 65536 and would produce a wrong ordering.
 */
class DistanceComparator {
public:
    Point orig; // the point distances are measured from

    /**
     * @brief Stores a copy of the reference point.
     * @param point the point all distances are measured from
     */
    DistanceComparator(const Point &point):orig(point){ }

    /**
     * @brief Strict ordering by distance to orig.
     * @return true when a is strictly closer to orig than b.
     */
    bool operator()(const Point& a,const Point& b) const
    {
        return SquaredDistance(a) < SquaredDistance(b);
    }

    /**
     * @brief Euclidean distance between orig and a.
     */
    double_t Distance(const Point& a) const
    {
        return sqrt(SquaredDistance(a));
    }

private:
    /**
     * @brief Squared Euclidean distance between orig and a, computed in
     * floating point so that neither the difference nor the square can wrap.
     */
    double_t SquaredDistance(const Point& a) const
    {
        const double_t dx = static_cast<double_t>(orig.x_) - static_cast<double_t>(a.x_);
        const double_t dy = static_cast<double_t>(orig.y_) - static_cast<double_t>(a.y_);
        return dx*dx + dy*dy;
    }
};
/**
 * @brief A node of the tree used by Quilts.
 *
 * A node is either a leaf (is_leaf_ == true) that keeps its points in
 * page_data_, or an internal node that keeps pointers to its children in
 * child_. Every node also carries the bounding box of the points below it.
 *
 * NOTE(review): children are held through raw pointers and nothing visible in
 * this header frees them; confirm whether the leak is acceptable for the
 * benchmark binaries.
 */
class BplusTreeNode{
public:
    std::vector<uint64_t> Z_ranges_; // Will store C-1 ranges if we have C children
    std::vector<BplusTreeNode*> child_; // array to store C (in range 1-4) children
    bool is_leaf_;
    uint32_t num_children_;
    std::vector<Point> page_data_;
    uint32_t node_lowx_,node_lowy_,node_highx_,node_highy_;

    /**
     * @brief Creates an internal node without children and with an "empty"
     * bounding box (low corner at the maximum value, high corner at the
     * minimum), so that the first min/max update sets a real box.
     */
    BplusTreeNode(){
        num_children_=0;
        is_leaf_=false;
        node_lowx_ = std::numeric_limits<uint32_t>::max();
        node_lowy_ = std::numeric_limits<uint32_t>::max();
        node_highx_ = std::numeric_limits<uint32_t>::min();
        node_highy_ = std::numeric_limits<uint32_t>::min();
    }

    /**
     * @brief Number of integer grid cells covered by the bounding box
     * (both borders inclusive).
     * @return 0 for a box that was never set (high < low); otherwise
     * width * height, evaluated in 64 bits so the product cannot wrap at 2^32.
     */
    uint64_t Area(){
        if(node_highx_ < node_lowx_ || node_highy_ < node_lowy_)
            return 0;
        const uint64_t width = static_cast<uint64_t>(node_highx_) - node_lowx_ + 1;
        const uint64_t height = static_cast<uint64_t>(node_highy_) - node_lowy_ + 1;
        return width*height;
    }

    /**
     * @brief Smallest axis-parallel distance from other to a border of the
     * bounding box.
     * @return 0 when other lies outside the box. Without that check the
     * unsigned subtractions would wrap and report a large positive margin.
     */
    uint64_t DistanceToEdge(const Point& other){
        const bool outside = other.x_ < node_lowx_ || other.x_ > node_highx_ ||
                             other.y_ < node_lowy_ || other.y_ > node_highy_;
        if(outside)
            return 0;
        return std::min( std::min(other.x_-node_lowx_,other.y_-node_lowy_),
                         std::min(node_highx_-other.x_,node_highy_-other.y_));
    }
};
/**
 * @brief Comparator that orders (key, point) pairs by their uint64_t key only;
 * the point stored next to the key takes no part in the comparison.
 */
struct less_than_key
{
    inline bool operator() (const std::pair<uint64_t,Point>& obj1, const std::pair<uint64_t,Point>& obj2)
    {
        const uint64_t left_key = obj1.first;
        const uint64_t right_key = obj2.first;
        return left_key < right_key;
    }
};
class Quilts{
public:
uint32_t bitmask_lx_,bitmask_ly_,bitmask_ux_,bitmask_uy_;
uint32_t page_size_;
uint32_t num_datapoints_;
BplusTreeNode* root_;
// *METRICS*
uint64_t metric_time_spent_scanning_pages_{}; // chrono::duration to store amount of time spent scanning.
uint64_t metric_num_pages_accessed_{};
uint64_t metric_num_elements_scanned_{};
uint32_t node_cnt_{};
uint32_t page_cnt_{};
Quilts(std::vector<Point> &datapoints, std::vector<std::pair<Point,Point>> &queries,uint32_t page_size):page_size_(page_size){
num_datapoints_ = datapoints.size();
ExtractQueryRelevantBits(queries);
std::vector<std::pair<uint64_t,Point>> mapped_data_points;
for(auto &d: datapoints)
mapped_data_points.push_back(std::make_pair(MapZValue(d),d));
std::sort(mapped_data_points.begin(),mapped_data_points.end(),less_than_key());
root_ = new BplusTreeNode();
node_cnt_++;
BulkLoad(mapped_data_points);
}
void BulkLoad(std::vector<std::pair<uint64_t,Point>> &dataset){
uint32_t effective_page_size = uint32_t(0.7 *page_size_); // Adding gaps for inserts
for(int i=0;i<dataset.size();i+=effective_page_size){
BplusTreeNode *page_node = new BplusTreeNode();
node_cnt_++;
page_node->is_leaf_ = true;
page_node->Z_ranges_.push_back(dataset[i].first);
uint32_t last_elem_ix = std::min((i+effective_page_size-1),uint32_t(dataset.size()-1));
page_node->Z_ranges_.push_back(dataset[last_elem_ix].first);
for(int j=0;j<effective_page_size && (i+j)<dataset.size();j++){
page_node->page_data_.push_back(dataset[i+j].second);
page_node->node_lowx_ = std::min(page_node->node_lowx_,dataset[i+j].second.x_);
page_node->node_lowy_ = std::min(page_node->node_lowy_,dataset[i+j].second.y_);
page_node->node_highx_ = std::max(page_node->node_highx_,dataset[i+j].second.x_);
page_node->node_highy_ = std::max(page_node->node_highy_,dataset[i+j].second.y_);
}
page_cnt_++;
if(i==0){
root_->child_.push_back(page_node);
root_->num_children_++;
continue;
}
InsertPageEnd(root_,page_node);
// Btree node splitting logic.
if(root_->num_children_>4){
assert(root_->num_children_== 5);
BplusTreeNode *new_node = new BplusTreeNode();
node_cnt_++;
BplusTreeNode *old_root_node = root_;
new_node->Z_ranges_.push_back(old_root_node->Z_ranges_[3]);
old_root_node->Z_ranges_.pop_back();
new_node->child_.push_back(old_root_node->child_[3]);
new_node->child_.push_back(old_root_node->child_[4]);
new_node->num_children_+=2;
old_root_node->child_.pop_back();
old_root_node->child_.pop_back();
old_root_node->num_children_-=2;
root_= new BplusTreeNode();
node_cnt_++;
root_->child_.push_back(old_root_node);
root_->Z_ranges_.push_back(old_root_node->Z_ranges_[2]);
old_root_node->Z_ranges_.pop_back();
root_->child_.push_back(new_node);
root_->num_children_+=2;
}
}
UpdateBoundingBoxesForInternalNodes(root_);
}
/**
 * @brief Prints the subtree below curr_node to std::cout in pre-order.
 *
 * A leaf is printed on one line with its two Z_ranges_ entries and its bounding
 * box. An internal node is printed with its child count, the first Z_ranges_
 * entry of each child interleaved with its own separator keys, and its bounding
 * box on a second line; its children follow.
 *
 * @param curr_node root of the subtree to print
 */
void PrintTree(BplusTreeNode* curr_node){
    // Writes "lowx,lowy) (highx,highy)" and ends the line.
    auto print_box_corners = [](const BplusTreeNode *node){
        std::cout<<node->node_lowx_<<","<<node->node_lowy_<<") ("
                 <<node->node_highx_<<","<<node->node_highy_<<")"<<std::endl;
    };

    if(!curr_node->is_leaf_){
        const uint32_t fanout = curr_node->num_children_;
        std::cout<<"Node\t #:"<<fanout;
        for(uint32_t ix=0; ix<fanout; ++ix){
            std::cout<<"\t ["<<curr_node->child_[ix]->Z_ranges_[0]<<"] ";
            // A separator exists only between two neighbouring children.
            if(ix+1<fanout)
                std::cout<<curr_node->Z_ranges_[ix];
        }
        std::cout<<std::endl;
        std::cout<<"\t Bounding box: (";
        print_box_corners(curr_node);
        for(uint32_t ix=0; ix<fanout; ++ix)
            PrintTree(curr_node->child_[ix]);
        return;
    }

    std::cout<<"LEAF\t Z-ranges:"<<curr_node->Z_ranges_[0]<<" - "<<curr_node->Z_ranges_[1];
    std::cout<<"\t Bounding Box: (";
    print_box_corners(curr_node);
}
/**
 * @brief Appends page_node as the right-most leaf below curr_node.
 *
 * The call descends along the right-most children until it reaches the node
 * whose right-most child is a leaf and appends the page there, with the page's
 * first key as separator. On the way back up, a child that has grown to five
 * children is split: its last two children move into a new sibling and the
 * separator between the two halves is pushed into curr_node. A split of
 * curr_node itself is left to the caller.
 *
 * @param curr_node node to insert below; must have at least one child
 * @param page_node leaf page to append
 */
void InsertPageEnd(BplusTreeNode *curr_node,BplusTreeNode *page_node){
    BplusTreeNode *last_child = curr_node->child_[curr_node->num_children_-1];

    // The children of curr_node are leaves: the page is attached right here.
    if(last_child->is_leaf_){
        curr_node->Z_ranges_.push_back(page_node->Z_ranges_[0]);
        curr_node->child_.push_back(page_node);
        ++curr_node->num_children_;
        return;
    }

    InsertPageEnd(last_child,page_node);
    if(last_child->num_children_<=4)
        return;

    // last_child overflowed: split it into 3 + 2 children.
    assert(last_child->num_children_== 5);
    BplusTreeNode *sibling = new BplusTreeNode();
    ++node_cnt_;

    const uint64_t sibling_separator = last_child->Z_ranges_[3];  // between children 3 and 4
    const uint64_t promoted_separator = last_child->Z_ranges_[2]; // between children 2 and 3
    last_child->Z_ranges_.pop_back();
    last_child->Z_ranges_.pop_back();

    sibling->Z_ranges_.push_back(sibling_separator);
    sibling->child_.push_back(last_child->child_[3]);
    sibling->child_.push_back(last_child->child_[4]);
    sibling->num_children_+=2;

    last_child->child_.pop_back();
    last_child->child_.pop_back();
    last_child->num_children_-=2;

    curr_node->Z_ranges_.push_back(promoted_separator);
    curr_node->child_.push_back(sibling);
    ++curr_node->num_children_;
}
/**
 * @brief Derives the four bit boundaries used by MapZValue from a query workload.
 *
 * For every query the number of bits needed to span its extent,
 * ceil(log2(extent)), is computed per dimension. bitmask_lx_/bitmask_ly_
 * receive the minimum over all queries and bitmask_ux_/bitmask_uy_ the maximum.
 *
 * The bit count is computed with integer arithmetic: the floating-point form
 * log(extent)/log(2) is -inf for an extent of 0 (casting that to an unsigned
 * type is undefined) and may round up at exact powers of two.
 *
 * @param queries range queries as (first corner, second corner) pairs. If it is
 * empty, all four boundaries end up at 32, so that lower <= upper always holds.
 */
void ExtractQueryRelevantBits(std::vector<std::pair<Point,Point>> &queries){
    // Smallest b in [0, 32] with 2^b >= |high - low|; 0 for an extent of 0 or 1.
    auto bits_for_extent = [](uint32_t low, uint32_t high) -> uint32_t {
        const uint32_t extent = (high >= low) ? (high - low) : (low - high);
        uint32_t bits = 0;
        while(bits < 32 && (static_cast<uint64_t>(1) << bits) < extent)
            bits++;
        return bits;
    };

    bitmask_lx_ = 32;
    bitmask_ly_ = 32;
    bitmask_ux_ = 0;
    bitmask_uy_ = 0;
    for(auto& q: queries){
        const uint32_t bit_x = bits_for_extent(q.first.x_, q.second.x_);
        const uint32_t bit_y = bits_for_extent(q.first.y_, q.second.y_);
        bitmask_lx_ = std::min(bitmask_lx_,bit_x);
        bitmask_ly_ = std::min(bitmask_ly_,bit_y);
        bitmask_ux_ = std::max(bitmask_ux_,bit_x);
        bitmask_uy_ = std::max(bitmask_uy_,bit_y);
    }
    // Only reachable with an empty workload: keep lower <= upper so that the
    // differences taken in MapZValue cannot underflow.
    bitmask_ux_ = std::max(bitmask_ux_,bitmask_lx_);
    bitmask_uy_ = std::max(bitmask_uy_,bitmask_ly_);
}
/**
 * @brief Maps a point to its 64-bit key by interleaving the 32 bits of x and
 * the 32 bits of y.
 *
 * Bits are emitted from least to most significant in three phases whose lengths
 * come from the bit boundaries: bitmask_l*_ bits per dimension, then
 * bitmask_u*_ - bitmask_l*_ bits, then the remaining 32 - bitmask_u*_ bits.
 * Within a phase the dimension with more bits left goes first (x on ties).
 * The bits of each coordinate keep their relative order in the key.
 *
 * All masks and shifts use uint64_t. The previous `1L` literal is only 32 bits
 * wide on platforms where long is 32-bit, and shifting a signed value into the
 * top bit is not portable. The boundaries are clamped to
 * 0 <= lower <= upper <= 32 so that no phase length can underflow.
 *
 * @param p point to map
 * @return the interleaved key
 */
uint64_t MapZValue(Point p){
    const uint32_t bits_per_dim = 32;
    const uint32_t lx = (bitmask_lx_ < bits_per_dim) ? bitmask_lx_ : bits_per_dim;
    const uint32_t ly = (bitmask_ly_ < bits_per_dim) ? bitmask_ly_ : bits_per_dim;
    uint32_t ux = (bitmask_ux_ < bits_per_dim) ? bitmask_ux_ : bits_per_dim;
    uint32_t uy = (bitmask_uy_ < bits_per_dim) ? bitmask_uy_ : bits_per_dim;
    if(ux < lx) ux = lx;
    if(uy < ly) uy = ly;

    const uint64_t x = p.x_;
    const uint64_t y = p.y_;
    uint64_t mapped_value=0;
    uint32_t ix_x=0,ix_y=0; // number of x / y bits emitted so far

    // Emits a more bits of x and b more bits of y. Bit ix_x of x lands at key
    // position ix_x + ix_y, i.e. right after everything emitted before it.
    auto interleave = [&](uint32_t a, uint32_t b){
        while(a>0 || b>0){
            if(b<=a){
                mapped_value |= (x & (static_cast<uint64_t>(1)<<ix_x))<<ix_y;
                ix_x++;
                a--;
            }
            else {
                mapped_value |= (y & (static_cast<uint64_t>(1)<<ix_y))<<ix_x;
                ix_y++;
                b--;
            }
        }
    };

    interleave(lx, ly);
    interleave(ux-lx, uy-ly);
    interleave(bits_per_dim-ux, bits_per_dim-uy);
    return mapped_value;
}
/**
 * @brief Enlarges the bounding box of every internal node below curr_node
 * (curr_node included) so that it covers the boxes of all its children.
 *
 * Leaves are left untouched. Boxes are only ever enlarged here, never shrunk.
 *
 * @param curr_node root of the subtree to update
 */
void UpdateBoundingBoxesForInternalNodes(BplusTreeNode *curr_node){
    if(curr_node->is_leaf_)
        return;
    for(uint32_t ix=0; ix<curr_node->num_children_; ++ix){
        BplusTreeNode *child = curr_node->child_[ix];
        // The child's box has to be final before it is merged into the parent's.
        UpdateBoundingBoxesForInternalNodes(child);
        if(child->node_lowx_ < curr_node->node_lowx_)
            curr_node->node_lowx_ = child->node_lowx_;
        if(child->node_lowy_ < curr_node->node_lowy_)
            curr_node->node_lowy_ = child->node_lowy_;
        if(child->node_highx_ > curr_node->node_highx_)
            curr_node->node_highx_ = child->node_highx_;
        if(child->node_highy_ > curr_node->node_highy_)
            curr_node->node_highy_ = child->node_highy_;
    }
}
/**
 * @brief Returns all indexed points inside the rectangle spanned by
 * query.first and query.second (borders included).
 *
 * Both corners are mapped to keys and the search is delegated to
 * RangeQueryHelper, starting at the root.
 *
 * @param query (first corner, second corner) of the rectangle
 * @return the matching points, in the order the helper found them
 */
std::vector<Point> RangeQuery(std::pair<Point,Point> query){
    std::vector<Point> matches;
    const uint64_t z_of_first_corner = MapZValue(query.first);
    const uint64_t z_of_second_corner = MapZValue(query.second);
    RangeQueryHelper(root_, z_of_first_corner, z_of_second_corner, query, matches);
    return matches;
}
/**
 * @brief Recursive part of RangeQuery.
 *
 * At a leaf every stored point is tested against the rectangle and matches are
 * appended to query_results; the scan is counted in the metrics. At an internal
 * node the separator keys select the children whose key range can overlap
 * [query_lowerbound_Z, query_upperbound_Z]; among those, only children whose
 * bounding box intersects the rectangle are visited.
 *
 * @param curr_node node to search
 * @param query_lowerbound_Z key of query.first
 * @param query_upperbound_Z key of query.second
 * @param query rectangle as (first corner, second corner), borders inclusive
 * @param query_results output vector the matches are appended to
 */
void RangeQueryHelper(BplusTreeNode *curr_node, uint64_t query_lowerbound_Z, uint64_t query_upperbound_Z, std::pair<Point,Point> query, std::vector<Point> &query_results){
    // IF its a leaf node just filter and return points.
    if(curr_node->is_leaf_){
        metric_num_elements_scanned_+=curr_node->page_data_.size();
        metric_num_pages_accessed_ ++;
        auto page_scan_phase_start = std::chrono::high_resolution_clock::now();
        for(const Point &p: curr_node->page_data_ ){
            if(p.x_ >= query.first.x_ && p.x_ <= query.second.x_ && p.y_ >= query.first.y_ && p.y_ <= query.second.y_){
                query_results.push_back(p);
            }
        }
        auto page_scan_phase_end = std::chrono::high_resolution_clock::now();
        metric_time_spent_scanning_pages_+=std::chrono::duration_cast<std::chrono::nanoseconds>(page_scan_phase_end-page_scan_phase_start).count();
        return;
    }

    // An internal node without children (empty index) holds nothing; without
    // this check num_children_-1 below would wrap around.
    if(curr_node->num_children_ == 0)
        return;
    const uint32_t last_child_ix = curr_node->num_children_-1;

    // Z_ranges_[i] separates child i from child i+1. Child i is skipped only
    // when the lower key is strictly above the separator: with duplicate keys
    // the left child may still hold an element equal to the separator.
    uint32_t valid_child_ix_lowerbound = 0;
    while( valid_child_ix_lowerbound<last_child_ix &&
            query_lowerbound_Z > curr_node->Z_ranges_[valid_child_ix_lowerbound]){
        valid_child_ix_lowerbound++;
    }
    uint32_t valid_child_ix_upperbound = last_child_ix;
    while( valid_child_ix_upperbound>0 &&
            query_upperbound_Z < curr_node->Z_ranges_[valid_child_ix_upperbound-1]){
        valid_child_ix_upperbound--;
    }

    for(uint32_t ch_ix = valid_child_ix_lowerbound; ch_ix<=valid_child_ix_upperbound; ch_ix++)
    {
        BplusTreeNode *child = curr_node->child_[ch_ix];
        // Both the rectangle and the boxes are inclusive, so an overlap that is
        // just one row or column wide still counts (hence <=, not <).
        const bool overlaps_x = std::max(child->node_lowx_,query.first.x_) <= std::min(child->node_highx_,query.second.x_);
        const bool overlaps_y = std::max(child->node_lowy_,query.first.y_) <= std::min(child->node_highy_,query.second.y_);
        if(overlaps_x && overlaps_y)
            RangeQueryHelper(child,query_lowerbound_Z,query_upperbound_Z,query,query_results);
    }
}
/**
 * @brief Tests whether a point with exactly the coordinates of query_point is
 * stored in the index.
 *
 * The leaf responsible for the point's key is located and scanned. Both
 * coordinates must match; the earlier `<=` comparison on y also reported points
 * that merely share x and have a smaller y.
 *
 * @param query_point point to look up
 * @return true if an equal point is found in that leaf, false otherwise
 */
bool PointQuery(Point query_point){
    const uint64_t p_Z = MapZValue(query_point);
    const BplusTreeNode *leaf = LeafNodeForPoint(root_, p_Z, query_point);
    for(const Point &p: leaf->page_data_ ){
        if(p.x_ == query_point.x_ && p.y_ == query_point.y_ )
            return true;
    }
    return false;
}
/**
 * @brief Walks from curr_node down to the leaf whose key range contains p_Z.
 *
 * At each internal node the walk moves past every separator key that is not
 * greater than p_Z and then descends into the child reached that way.
 *
 * @param curr_node node to start from
 * @param p_Z key of the point
 * @param query_point the point itself; it does not influence the result
 * @return the leaf reached
 */
BplusTreeNode* LeafNodeForPoint(BplusTreeNode *curr_node, uint64_t p_Z, Point query_point){
    (void)query_point;
    BplusTreeNode *node = curr_node;
    while(!node->is_leaf_){
        const uint32_t last_ix = node->num_children_-1;
        uint32_t pick = 0;
        for(; pick<last_ix; ++pick){
            if(p_Z < node->Z_ranges_[pick])
                break;
        }
        node = node->child_[pick];
    }
    return node;
}
/**
 * @brief Returns the k points closest to query_point, nearest first.
 *
 * Strategy: the leaf responsible for the query point is examined first. If it
 * holds at least k points and the k-th nearest of them is closer than the
 * nearest border of the leaf's bounding box, those k points are returned
 * directly. Otherwise a square window around the query point is searched with
 * RangeQuery and enlarged until the k-th nearest point in the window lies
 * within the window radius.
 *
 * NOTE(review): the leaf shortcut trusts the leaf's bounding box; confirm that
 * no other leaf can hold points inside that box.
 *
 * Differences from the previous version: k == 0 returns an empty result instead
 * of indexing element k-1; the radius is capped at 2^30 so that doubling it
 * cannot overflow; and the search stops once the window covers the whole root
 * bounding box (or the cap is reached) instead of looping forever or tripping
 * an assert when fewer than k points exist.
 *
 * @param query_point centre of the search
 * @param k number of neighbours requested
 * @return k points sorted by distance, or all points found (sorted) if the
 * search area was exhausted with fewer than k points
 */
std::vector<Point> KNNQuery(Point query_point,uint32_t k){
    // Largest coordinate the window clipping below is written for (2^30).
    const uint32_t max_coordinate = 1073741824;
    std::vector<Point> queryResult;
    if(k == 0)
        return queryResult;

    // Converts a floating-point radius to an integer one. Values that are too
    // large, infinite or NaN map to the cap; negative values map to 0.
    auto to_radius = [max_coordinate](double_t value) -> uint32_t {
        if(!(value < max_coordinate))
            return max_coordinate;
        if(value < 0)
            return 0;
        return uint32_t(value);
    };

    uint64_t p_Z = MapZValue(query_point);
    BplusTreeNode* leaf_node_of_p = LeafNodeForPoint(root_, p_Z, query_point);
    DistanceComparator distance_sorter = DistanceComparator(query_point);
    std::vector<Point> range_query_result(leaf_node_of_p->page_data_.begin(),leaf_node_of_p->page_data_.end());
    std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);

    uint32_t dist;
    if(range_query_result.size()>=k){
        dist = to_radius(distance_sorter.Distance(range_query_result[k-1]));
        if (dist < leaf_node_of_p->DistanceToEdge(query_point)){
            queryResult.assign(range_query_result.begin(),range_query_result.begin()+k);
            return queryResult;
        }
        dist = std::min<uint32_t>(max_coordinate, dist*2+1);
    }
    else{
        // Too few points in the leaf: estimate the radius from the leaf's
        // point density. A zero or undefined density falls back to the cap.
        const uint64_t area = leaf_node_of_p->Area();
        const double_t density = leaf_node_of_p->page_data_.size()*1.0/area;
        const double_t pi = 2 * acos(0.0);
        dist = to_radius(sqrt((k*4.0)/(density*pi))+1);
    }

    do{
        /*
            the values sometimes loop around due to the unsigned ints. So the following lines bounds the range properly.
        */
        uint32_t rq_low_x = std::max(root_->node_lowx_,query_point.x_-std::min(query_point.x_,dist));
        uint32_t rq_low_y = std::max(root_->node_lowy_,query_point.y_-std::min(query_point.y_,dist));
        uint32_t rq_high_x = std::min(root_->node_highx_,query_point.x_+std::min(uint32_t(max_coordinate-query_point.x_),dist));
        uint32_t rq_high_y = std::min(root_->node_highy_,query_point.y_+std::min(uint32_t(max_coordinate-query_point.y_),dist));
        // Once the window equals the root box (or the radius hit its cap),
        // growing it further cannot reveal any new point.
        const bool search_exhausted = dist >= max_coordinate ||
            (rq_low_x == root_->node_lowx_ && rq_low_y == root_->node_lowy_ &&
             rq_high_x == root_->node_highx_ && rq_high_y == root_->node_highy_);

        std::pair<Point,Point> range_query = std::make_pair(
                                    Point(rq_low_x, rq_low_y),
                                    Point(rq_high_x, rq_high_y));
        range_query_result = RangeQuery(range_query);
        if(range_query_result.size()<k){
            if(search_exhausted){
                // Fewer than k points are reachable: return all of them.
                std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);
                return range_query_result;
            }
            dist = std::min<uint32_t>(max_coordinate, dist*2);
            continue;
        }
        std::sort(range_query_result.begin(),range_query_result.end(),distance_sorter);
        const double_t kth_distance = distance_sorter.Distance(range_query_result[k-1]);
        // The answer is final when the circle through the k-th point fits in
        // the window, or when the window cannot grow any more.
        if(kth_distance<dist || search_exhausted)
            queryResult.assign(range_query_result.begin(),range_query_result.begin()+k);
        else
            dist = std::min<uint32_t>(max_coordinate, to_radius(kth_distance)*2);
    }while(queryResult.size()<k);
    return queryResult;
}
/**
 * @brief Inserts a point into the index.
 *
 * The point is routed by its key to the responsible leaf (same rule as
 * LeafNodeForPoint) and appended to that leaf's page. Every node on the path,
 * the leaf included, has its bounding box enlarged to cover the new point;
 * without this, range queries that prune by bounding box could miss it.
 *
 * If the page then holds more than page_size_ points, the leaf is turned into
 * an internal node whose children are up to four new leaves holding the points
 * in key order. The points are divided evenly and the first leaf takes the
 * remainder, so every point ends up in exactly one new leaf (the previous index
 * arithmetic duplicated some points and dropped the last n % 4).
 *
 * An index without any page (empty root) first receives an empty leaf.
 *
 * @param query_point point to insert
 */
void InsertElement(Point query_point){
    const uint64_t p_Z = MapZValue(query_point);

    // Empty index: give the root a first, empty page to insert into.
    if(!root_->is_leaf_ && root_->num_children_ == 0){
        BplusTreeNode *first_page = new BplusTreeNode();
        node_cnt_++;
        page_cnt_++;
        first_page->is_leaf_ = true;
        first_page->Z_ranges_.push_back(p_Z);
        first_page->Z_ranges_.push_back(p_Z);
        root_->child_.push_back(first_page);
        root_->num_children_++;
    }

    // Enlarges a node's bounding box so that it covers the new point.
    auto grow_box = [&query_point](BplusTreeNode *node){
        node->node_lowx_ = std::min(node->node_lowx_,query_point.x_);
        node->node_lowy_ = std::min(node->node_lowy_,query_point.y_);
        node->node_highx_ = std::max(node->node_highx_,query_point.x_);
        node->node_highy_ = std::max(node->node_highy_,query_point.y_);
    };

    // Descend to the leaf, enlarging the boxes along the way.
    BplusTreeNode *curr_node = root_;
    grow_box(curr_node);
    while(!curr_node->is_leaf_){
        uint32_t child_ix = 0;
        while(child_ix+1<curr_node->num_children_ &&
                p_Z >= curr_node->Z_ranges_[child_ix]){
            child_ix++;
        }
        curr_node = curr_node->child_[child_ix];
        grow_box(curr_node);
    }
    curr_node->page_data_.push_back(query_point);
    if(curr_node->page_data_.size()<=page_size_)
        return;

    // The page overflowed: redistribute its points over up to 4 new leaves.
    std::vector<std::pair<uint64_t,Point>> mapped_data_points;
    for(auto &d: curr_node->page_data_)
        mapped_data_points.push_back(std::make_pair(MapZValue(d),d));
    std::sort(mapped_data_points.begin(),mapped_data_points.end(),less_than_key());

    curr_node->page_data_.clear();
    curr_node->is_leaf_=false;
    curr_node->node_lowx_ = std::numeric_limits<uint32_t>::max();
    curr_node->node_lowy_ = std::numeric_limits<uint32_t>::max();
    curr_node->node_highx_ = std::numeric_limits<uint32_t>::min();
    curr_node->node_highy_ = std::numeric_limits<uint32_t>::min();
    curr_node->Z_ranges_.clear();

    // At least one element is present here, so num_new_leaves >= 1 and every
    // new leaf receives at least one point.
    const uint32_t num_elements = uint32_t(mapped_data_points.size());
    const uint32_t num_new_leaves = std::min<uint32_t>(4u, num_elements);
    const uint32_t new_leaf_size = num_elements/num_new_leaves;
    const uint32_t num_leftover_points = num_elements%num_new_leaves;
    uint32_t first_elem_ix = 0;
    for(uint32_t i=0;i<num_new_leaves;i++){
        const uint32_t leaf_len = new_leaf_size + (i==0 ? num_leftover_points : 0);
        const uint32_t last_elem_ix = first_elem_ix + leaf_len - 1;

        BplusTreeNode *page_node = new BplusTreeNode();
        node_cnt_++;
        page_node->is_leaf_ = true;
        page_node->Z_ranges_.push_back(mapped_data_points[first_elem_ix].first);
        page_node->Z_ranges_.push_back(mapped_data_points[last_elem_ix].first);
        for(uint32_t j=first_elem_ix;j<=last_elem_ix;j++){
            const Point &p = mapped_data_points[j].second;
            page_node->page_data_.push_back(p);
            page_node->node_lowx_ = std::min(page_node->node_lowx_,p.x_);
            page_node->node_lowy_ = std::min(page_node->node_lowy_,p.y_);
            page_node->node_highx_ = std::max(page_node->node_highx_,p.x_);
            page_node->node_highy_ = std::max(page_node->node_highy_,p.y_);
        }
        // The first key of every leaf but the first becomes a separator.
        if(i>0)
            curr_node->Z_ranges_.push_back(page_node->Z_ranges_[0]);
        curr_node->child_.push_back(page_node);
        curr_node->num_children_ ++;
        first_elem_ix = last_elem_ix + 1;
    }
    // One page was replaced by num_new_leaves pages.
    page_cnt_ += num_new_leaves - 1;
    // The ancestors were already enlarged during the descent; only the box of
    // the split node has to be rebuilt from its new children.
    UpdateBoundingBoxesForInternalNodes(curr_node);
}
// *********************** Extract Metrics ******************
/**
 * @brief Time spent scanning leaf pages, accumulated since construction or the
 * last ClearMetric() call.
 * @return the accumulated value of metric_time_spent_scanning_pages_ (counted
 * in nanoseconds by RangeQueryHelper), converted to double_t
 */
double_t TimeSpentScanningPages(){
    const double_t scan_time = static_cast<double_t>(metric_time_spent_scanning_pages_);
    return scan_time;
}
/**
 * @brief Estimated size of the index in bytes.
 *
 * The estimate is sizeof(BplusTreeNode) for every counted node plus three
 * uint64_t Z-range values per counted page: two stored in the page itself and,
 * presumably, one separator stored in its parent node.
 *
 * @return the estimate in bytes
 */
size_t ModelSize(){
    const size_t bytes_for_nodes = sizeof(BplusTreeNode)*node_cnt_;
    const size_t bytes_for_zranges = page_cnt_ * 3 * sizeof(uint64_t);
    return bytes_for_nodes + bytes_for_zranges;
}
/**
 * @brief Number of points examined while scanning leaf pages, accumulated
 * since construction or the last ClearMetric() call.
 */
uint64_t NumElementsScanned(){
    const uint64_t scanned = metric_num_elements_scanned_;
    return scanned;
}
/**
 * @brief Number of leaf pages scanned, accumulated since construction or the
 * last ClearMetric() call.
 */
uint64_t NumPagesAcessed(){
    const uint64_t accessed = metric_num_pages_accessed_;
    return accessed;
}
/**
 * @brief Resets the three query metrics (scan time, pages accessed, elements
 * scanned) to zero. The node and page counters are not touched.
 */
void ClearMetric(){
    metric_num_elements_scanned_ = 0;
    metric_num_pages_accessed_ = 0;
    metric_time_spent_scanning_pages_ = 0; // accumulated scan time
}
};
#endif
// #9466616920358864793
// #8998070210664679321
\ No newline at end of file