Skip to content
Snippets Groups Projects
Commit 28239866 authored by Nikolaj Tatti's avatar Nikolaj Tatti
Browse files

splits for real-valued data

parent 5135a0b1
No related branches found
No related tags found
No related merge requests found
SRCS = realsplit.cpp real.cpp
OBJS = $(SRCS:.cpp=.o)
CC = g++
LDFLAGS= -lm -O3
CFLAGS= -g -O3 -MMD -Wall -Wextra
all : realsplit
realsplit: $(OBJS)
$(CC) $(LDFLAGS) -o $@ $(OBJS)
%.o : %.cpp
$(CC) $(CFLAGS) -c $*.cpp
clean:
-rm *.o realsplit
zip:
zip realsplit.zip *.cpp *.h Makefile
-include $(SRCS:%.cpp=%.d)
#include "real.h"
void
attribute::init(double approx, uint32_t classcnt)
{
m_cnt = 0;
m_approx = approx;
m_candidates.push_back(candidate());
m_candidates.push_back(candidate());
candidate &front = m_candidates.front();
candidate &back = m_candidates.back();
front.init(classcnt);
back.init(classcnt);
front.threshold = m_values.insert(std::make_pair(KEYMIN, labellist())).first;
back.threshold = m_values.insert(std::make_pair(KEYMAX, labellist())).first;
}
void
attribute::add(double x, uint32_t k)
{
m_cnt++;
m_values[x].push_back(k);
for (candlist::iterator it = m_candidates.begin(); it != m_candidates.end(); ++it) {
if (x <= it->threshold->first)
it->ent1.change(k, 1);
else
it->ent2.change(k, 1);
}
candlist::iterator base = m_candidates.begin();
candlist::iterator cand = base;
for (++cand; cand != m_candidates.end(); ++cand) {
while (true) {
if (cand->ent1.value() <= (1 + m_approx)*base->ent1.value()) break;
attrmap::iterator t = cand->threshold;
--t;
if (t == base->threshold) break;
if (cand->threshold->first == KEYMAX) {
m_candidates.push_back(*cand);
}
cand->step();
}
++base;
}
}
double
attribute::optimal_split(double & threshold)
{
double score = std::numeric_limits<double>::infinity();
for (candlist::iterator it = m_candidates.begin(); it != m_candidates.end(); ++it) {
double cand = it->value();
if (cand < score) {
score = cand;
threshold = it->threshold->first;
}
}
return score;
}
#ifndef DTREAL_H
#define DTREAL_H
#include <stdint.h>
#include <vector>
#include <map>
#include <list>
#include <limits>
#include <math.h>
typedef std::vector<uint32_t> labelvector;
typedef std::list<uint32_t> labellist;
typedef std::map<double, labellist> attrmap;
const static double KEYMAX = std::numeric_limits<double>::infinity();
const static double KEYMIN = -std::numeric_limits<double>::infinity();
struct entropy
{
labelvector lab;
uint32_t n;
double v;
void
init(uint32_t k)
{
lab = labelvector(k);
n = 0;
v = 0;
}
void
change(uint32_t k, int32_t c)
{
if (lab[k] > 0) v += lab[k]*log(lab[k]);
n += c;
lab[k] += c;
if (lab[k] > 0) v -= lab[k]*log(lab[k]);
}
double value() const {return n > 0 ? v + n*log(n) : 0;}
};
struct candidate {
entropy ent1, ent2;
attrmap::iterator threshold;
void
init(uint32_t k)
{
ent1.init(k);
ent2.init(k);
}
double value() {return ent1.value() + ent2.value();}
void step()
{
labellist & labels = threshold->second;
for (labellist::iterator it = labels.begin(); it != labels.end(); ++it) {
ent1.change(*it, -1);
ent2.change(*it, 1);
}
--threshold;
}
};
typedef std::list<candidate> candlist;
class attribute {
public:
void init(double approx, uint32_t classcnt);
void add(double x, uint32_t k);
double optimal_split(double & threshold);
uint32_t candcnt() const {return m_candidates.size();}
uint32_t size() const {return m_cnt;}
protected:
double m_approx;
candlist m_candidates;
attrmap m_values;
uint32_t m_cnt;
};
typedef std::vector<attribute> attributevector;
#endif
#include "real.h"
#include "top.h"
#include <getopt.h>
#include <chrono>
typedef std::vector<double> doublevector;
typedef std::vector<doublevector> doublematrix;
typedef topresult<std::pair<uint32_t, double> > realtop;
struct node {
node(double a, uint32_t dim, uint32_t ccnt): attrs(dim), counts(ccnt), pos(0), neg(0)
{
for (uint32_t i = 0; i < dim; i++) attrs[i].init(a, ccnt);
}
attributevector attrs;
labelvector counts;
uint32_t cond;
double thresh;
node *pos, *neg;
};
node *
construct_tree(const doublematrix & d, const labelvector & labels, double approx, double prob, uint32_t classcnt, uint32_t nmax)
{
uint32_t dim = d[0].size();
node *root = new node(approx, dim, classcnt);
for (uint32_t i = 0; i < labels.size(); i++) {
node *cur = root;
while (cur->pos) {
if (d[i][cur->cond] <= cur->thresh)
cur = cur->pos;
else
cur = cur->neg;
}
realtop t;
for (uint32_t j = 0; j < dim; j++) {
cur->attrs[j].add(d[i][j], labels[i]);
double threshold;
double ent = cur->attrs[j].optimal_split(threshold);
t.test(ent, std::make_pair(j, threshold));
}
uint32_t n = cur->attrs[0].size();
//double threshold = (6*(2*(1 + log(n)) + log(2) + log(n)) + 2*log(2))*sqrt(log(1 / prob) / (2 * n));
double threshold = log(2)*sqrt(log(1 / prob) / (2 * n));
if (t.s2 - t.s1 >= threshold * n || (nmax > 0 && n >= nmax)) {
printf("split %d %f %d %d %f\n", n, t.s2 - t.s1, i, t.i1.first, t.i1.second);
cur->cond = t.i1.first;
cur->thresh = t.i1.second;
cur->pos = new node(approx, dim, classcnt);
cur->neg = new node(approx, dim, classcnt);
}
}
return root;
}
double
entropy(node *cur)
{
if (cur->pos == 0) {
double ent = 0;
uint32_t n = 0;
for (uint32_t i = 0; i < cur->counts.size(); i++) {
if (cur->counts[i] == 0) continue;
ent -= cur->counts[i]*log(cur->counts[i]);
n += cur->counts[i];
}
if (n > 0) ent += n*log(n);
return ent;
}
else
return entropy(cur->neg) + entropy(cur->pos);
}
void
set_counters(const doublematrix & d, const labelvector & labels, node *root)
{
for (uint32_t i = 0; i < labels.size(); i++) {
node *cur = root;
while (cur->pos) {
if (d[i][cur->cond] <= cur->thresh)
cur = cur->pos;
else
cur = cur->neg;
}
cur->counts[labels[i]]++;
}
}
uint32_t
read_real(FILE *f, doublematrix & d, labelvector & labels)
{
uint32_t cnt, dim, classcnt;
fscanf(f, "%d %d", &cnt, &dim);
d.resize(cnt);
labels.resize(cnt);
classcnt = 0;
for (uint32_t i = 0; i < cnt; i++) {
uint32_t label;
fscanf(f, "%d", &label);
labels[i] = label;
d[i].resize(dim);
if (label > classcnt) classcnt = label;
for (uint32_t j = 0; j < dim; j++) {
double v;
fscanf(f, "%lf", &v);
d[i][j] = v;
}
}
return classcnt + 1;
}
int
main(int argc, char **argv)
{
static struct option longopts[] = {
{"approx", required_argument, NULL, 'a'},
{"out", required_argument, NULL, 'o'},
{"in", required_argument, NULL, 'i'},
{"help", no_argument, NULL, 'h'},
{ NULL, 0, NULL, 0 }
};
char *inname = NULL;
float approx = 0.1;
uint32_t r = 0;
uint32_t nmax = 0;
double prob = 0;
int ch;
while ((ch = getopt_long(argc, argv, "hi:a:r:p:n:", longopts, NULL)) != -1) {
switch (ch) {
case 'h':
printf("Usage: %s [-d distance -a approx -p prob] -i <input file> -o <output file> [-h]\n", argv[0]);
printf(" -h print this help\n");
printf(" -i input file\n");
printf(" -a approximation guarantee (0.1 default)\n");
printf(" -r report every nth data point\n");
printf(" -p probability for Hoeffding bound\n");
printf(" -n maximum size of a leaf\n");
return 0;
break;
case 'n':
nmax = atoi(optarg);
break;
case 'r':
r = atoi(optarg);
break;
case 'i':
inname = optarg;
break;
case 'p':
prob = atof(optarg);
break;
case 'a':
approx = atof(optarg);
break;
}
}
if (inname == NULL) {
printf("input file not set\n");
return 2;
}
doublematrix data;
labelvector labels;
FILE *f = fopen(inname, "r");
uint32_t classcnt = read_real(f, data, labels);
fclose(f);
if (prob > 0) {
auto start = std::chrono::high_resolution_clock::now();
node *root = construct_tree(data, labels, approx, prob, classcnt, nmax);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
set_counters(data, labels, root);
printf("%f %f %f\n", approx, entropy(root) / labels.size(), double(duration.count()) / 1000000);
}
else {
uint32_t dim = data[0].size();
attributevector l(dim);
for (uint32_t i = 0; i < l.size(); i++) {
l[i].init(approx, classcnt);
}
auto start = std::chrono::high_resolution_clock::now();
realtop ft;
for (uint32_t i = 0; i < labels.size(); i++) {
realtop t;
for (uint32_t j = 0; j < dim; j++) {
l[j].add(data[i][j], labels[i]);
double threshold;
double ent = l[j].optimal_split(threshold);
t.test(ent, std::make_pair(j, threshold));
}
ft = t;
if (r > 0 && (i + 1) % r == 0) {
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
uint32_t ccnt = 0;
for (uint32_t j = 0; j < l.size(); j++) ccnt += l[j].candcnt() - 2;
printf("%f %d %d %d %lld %d %lf\n", approx, i + 1, dim, ccnt, duration.count(), ft.i1.first, ft.s1);
}
}
if (r == 0) {
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
uint32_t ccnt = 0;
for (uint32_t j = 0; j < l.size(); j++) ccnt += l[j].candcnt() - 2;
printf("%f %lu %d %d %lld %d %lf\n", approx, labels.size(), dim, ccnt, duration.count(), ft.i1.first, ft.s1);
}
}
return 0;
}
#ifndef TOP_H
#define TOP_H
#include <limits>
template <typename T>
struct topresult
{
double s1, s2;
T i1, i2;
topresult() {s1 = s2 = std::numeric_limits<double>::infinity();}
void
test(double s, const T & i)
{
if (s < s1) {
s2 = s1;
i2 = i1;
s1 = s;
i1 = i;
}
else if (s < s2) {
s2 = s;
i2 = i;
}
}
};
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment