diff --git a/real/Makefile b/real/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2b4dd9abac4857eaca7d9a7ccca5a1857952ddd5 --- /dev/null +++ b/real/Makefile @@ -0,0 +1,24 @@ +SRCS = realsplit.cpp real.cpp +OBJS = $(SRCS:.cpp=.o) + +CC = g++ + +LDFLAGS= -lm -O3 +CFLAGS= -g -O3 -MMD -Wall -Wextra + +all : realsplit + +realsplit: $(OBJS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) + + +%.o : %.cpp + $(CC) $(CFLAGS) -c $*.cpp + +clean: + -rm *.o realsplit + +zip: + zip realsplit.zip *.cpp *.h Makefile + +-include $(SRCS:%.cpp=%.d) diff --git a/real/real.cpp b/real/real.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e90d2654ba39637d9463f6eb411df4b2beb0320 --- /dev/null +++ b/real/real.cpp @@ -0,0 +1,69 @@ +#include "real.h" + +void +attribute::init(double approx, uint32_t classcnt) +{ + m_cnt = 0; + m_approx = approx; + m_candidates.push_back(candidate()); + m_candidates.push_back(candidate()); + + candidate &front = m_candidates.front(); + candidate &back = m_candidates.back(); + + front.init(classcnt); + back.init(classcnt); + + front.threshold = m_values.insert(std::make_pair(KEYMIN, labellist())).first; + back.threshold = m_values.insert(std::make_pair(KEYMAX, labellist())).first; +} + + +void +attribute::add(double x, uint32_t k) +{ + m_cnt++; + m_values[x].push_back(k); + + for (candlist::iterator it = m_candidates.begin(); it != m_candidates.end(); ++it) { + if (x <= it->threshold->first) + it->ent1.change(k, 1); + else + it->ent2.change(k, 1); + } + + candlist::iterator base = m_candidates.begin(); + candlist::iterator cand = base; + for (++cand; cand != m_candidates.end(); ++cand) { + while (true) { + if (cand->ent1.value() <= (1 + m_approx)*base->ent1.value()) break; + + attrmap::iterator t = cand->threshold; + --t; + if (t == base->threshold) break; + + + if (cand->threshold->first == KEYMAX) { + m_candidates.push_back(*cand); + } + + cand->step(); + } + ++base; + } +} + +double +attribute::optimal_split(double & threshold) +{ + double score = std::numeric_limits<double>::infinity(); + for (candlist::iterator it = m_candidates.begin(); it != m_candidates.end(); ++it) { + double cand = it->value(); + if (cand < score) { + score = cand; + threshold = it->threshold->first; + } + } + + return score; +} diff --git a/real/real.h b/real/real.h new file mode 100644 index 0000000000000000000000000000000000000000..c0f1bea56317385536e7afa3645fad7b3f34487b --- /dev/null +++ b/real/real.h @@ -0,0 +1,95 @@ +#ifndef DTREAL_H +#define DTREAL_H + +#include <stdint.h> +#include <vector> +#include <map> +#include <list> +#include <limits> +#include <math.h> + +typedef std::vector<uint32_t> labelvector; +typedef std::list<uint32_t> labellist; +typedef std::map<double, labellist> attrmap; + +const static double KEYMAX = std::numeric_limits<double>::infinity(); +const static double KEYMIN = -std::numeric_limits<double>::infinity(); + +struct entropy +{ + labelvector lab; + uint32_t n; + double v; + + void + init(uint32_t k) + { + lab = labelvector(k); + n = 0; + v = 0; + } + + + void + change(uint32_t k, int32_t c) + { + if (lab[k] > 0) v += lab[k]*log(lab[k]); + n += c; + lab[k] += c; + if (lab[k] > 0) v -= lab[k]*log(lab[k]); + } + + double value() const {return n > 0 ? v + n*log(n) : 0;} + +}; + +struct candidate { + entropy ent1, ent2; + attrmap::iterator threshold; + + void + init(uint32_t k) + { + ent1.init(k); + ent2.init(k); + } + + double value() {return ent1.value() + ent2.value();} + + void step() + { + labellist & labels = threshold->second; + for (labellist::iterator it = labels.begin(); it != labels.end(); ++it) { + ent1.change(*it, -1); + ent2.change(*it, 1); + } + --threshold; + } +}; + +typedef std::list<candidate> candlist; + + +class attribute { + public: + void init(double approx, uint32_t classcnt); + + void add(double x, uint32_t k); + + double optimal_split(double & threshold); + + uint32_t candcnt() const {return m_candidates.size();} + uint32_t size() const {return m_cnt;} + + protected: + double m_approx; + candlist m_candidates; + attrmap m_values; + uint32_t m_cnt; +}; + +typedef std::vector<attribute> attributevector; + + + +#endif diff --git a/real/realsplit.cpp b/real/realsplit.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c0acea5ecf02b71d36d3a6a173117b3fc56fc97 --- /dev/null +++ b/real/realsplit.cpp @@ -0,0 +1,249 @@ +#include "real.h" +#include "top.h" +#include <getopt.h> +#include <chrono> + +typedef std::vector<double> doublevector; + +typedef std::vector<doublevector> doublematrix; + +typedef topresult<std::pair<uint32_t, double> > realtop; + +struct node { + node(double a, uint32_t dim, uint32_t ccnt): attrs(dim), counts(ccnt), pos(0), neg(0) + { + for (uint32_t i = 0; i < dim; i++) attrs[i].init(a, ccnt); + } + + attributevector attrs; + labelvector counts; + + uint32_t cond; + double thresh; + node *pos, *neg; +}; + + + +node * +construct_tree(const doublematrix & d, const labelvector & labels, double approx, double prob, uint32_t classcnt, uint32_t nmax) +{ + uint32_t dim = d[0].size(); + node *root = new node(approx, dim, classcnt); + for (uint32_t i = 0; i < labels.size(); i++) { + node *cur = root; + while (cur->pos) { + if (d[i][cur->cond] <= cur->thresh) + cur = cur->pos; + else + cur = cur->neg; + } + + realtop t; + for (uint32_t j = 0; j < dim; j++) { + cur->attrs[j].add(d[i][j], labels[i]); + double threshold; + double ent = cur->attrs[j].optimal_split(threshold); + t.test(ent, std::make_pair(j, threshold)); + } + + + uint32_t n = cur->attrs[0].size(); + //double threshold = (6*(2*(1 + log(n)) + log(2) + log(n)) + 2*log(2))*sqrt(log(1 / prob) / (2 * n)); + double threshold = log(2)*sqrt(log(1 / prob) / (2 * n)); + + if (t.s2 - t.s1 >= threshold * n || (nmax > 0 && n >= nmax)) { + printf("split %d %f %d %d %f\n", n, t.s2 - t.s1, i, t.i1.first, t.i1.second); + cur->cond = t.i1.first; + cur->thresh = t.i1.second; + cur->pos = new node(approx, dim, classcnt); + cur->neg = new node(approx, dim, classcnt); + } + + } + + return root; +} + +double +entropy(node *cur) +{ + if (cur->pos == 0) { + double ent = 0; + uint32_t n = 0; + for (uint32_t i = 0; i < cur->counts.size(); i++) { + if (cur->counts[i] == 0) continue; + ent -= cur->counts[i]*log(cur->counts[i]); + n += cur->counts[i]; + } + if (n > 0) ent += n*log(n); + return ent; + } + else + return entropy(cur->neg) + entropy(cur->pos); +} + +void +set_counters(const doublematrix & d, const labelvector & labels, node *root) +{ + for (uint32_t i = 0; i < labels.size(); i++) { + node *cur = root; + while (cur->pos) { + if (d[i][cur->cond] <= cur->thresh) + cur = cur->pos; + else + cur = cur->neg; + } + cur->counts[labels[i]]++; + } +} + + +uint32_t +read_real(FILE *f, doublematrix & d, labelvector & labels) +{ + uint32_t cnt, dim, classcnt; + fscanf(f, "%d %d", &cnt, &dim); + + d.resize(cnt); + labels.resize(cnt); + + classcnt = 0; + + for (uint32_t i = 0; i < cnt; i++) { + uint32_t label; + fscanf(f, "%d", &label); + labels[i] = label; + d[i].resize(dim); + if (label > classcnt) classcnt = label; + for (uint32_t j = 0; j < dim; j++) { + double v; + fscanf(f, "%lf", &v); + d[i][j] = v; + } + } + + return classcnt + 1; +} + +int +main(int argc, char **argv) +{ + static struct option longopts[] = { + {"approx", required_argument, NULL, 'a'}, + {"out", required_argument, NULL, 'o'}, + {"in", required_argument, NULL, 'i'}, + {"help", no_argument, NULL, 'h'}, + { NULL, 0, NULL, 0 } + }; + + char *inname = NULL; + float approx = 0.1; + uint32_t r = 0; + uint32_t nmax = 0; + double prob = 0; + + int ch; + while ((ch = getopt_long(argc, argv, "hi:a:r:p:n:", longopts, NULL)) != -1) { + switch (ch) { + case 'h': + printf("Usage: %s [-d distance -a approx -p prob] -i <input file> -o <output file> [-h]\n", argv[0]); + printf(" -h print this help\n"); + printf(" -i input file\n"); + printf(" -a approximation guarantee (0.1 default)\n"); + printf(" -r report every nth data point\n"); + printf(" -p probability for Hoeffding bound\n"); + printf(" -n maximum size of a leaf\n"); + return 0; + break; + case 'n': + nmax = atoi(optarg); + break; + case 'r': + r = atoi(optarg); + break; + case 'i': + inname = optarg; + break; + case 'p': + prob = atof(optarg); + break; + case 'a': + approx = atof(optarg); + break; + } + } + + + if (inname == NULL) { + printf("input file not set\n"); + return 2; + } + + doublematrix data; + labelvector labels; + + FILE *f = fopen(inname, "r"); + uint32_t classcnt = read_real(f, data, labels); + fclose(f); + + if (prob > 0) { + auto start = std::chrono::high_resolution_clock::now(); + node *root = construct_tree(data, labels, approx, prob, classcnt, nmax); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + set_counters(data, labels, root); + printf("%f %f %f\n", approx, entropy(root) / labels.size(), double(duration.count()) / 1000000); + + } + else { + + uint32_t dim = data[0].size(); + + attributevector l(dim); + + + + for (uint32_t i = 0; i < l.size(); i++) { + l[i].init(approx, classcnt); + } + + + auto start = std::chrono::high_resolution_clock::now(); + realtop ft; + for (uint32_t i = 0; i < labels.size(); i++) { + realtop t; + for (uint32_t j = 0; j < dim; j++) { + l[j].add(data[i][j], labels[i]); + double threshold; + double ent = l[j].optimal_split(threshold); + t.test(ent, std::make_pair(j, threshold)); + } + ft = t; + if (r > 0 && (i + 1) % r == 0) { + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + + uint32_t ccnt = 0; + for (uint32_t j = 0; j < l.size(); j++) ccnt += l[j].candcnt() - 2; + printf("%f %d %d %d %lld %d %lf\n", approx, i + 1, dim, ccnt, duration.count(), ft.i1.first, ft.s1); + } + } + if (r == 0) { + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + + uint32_t ccnt = 0; + for (uint32_t j = 0; j < l.size(); j++) ccnt += l[j].candcnt() - 2; + printf("%f %lu %d %d %lld %d %lf\n", approx, labels.size(), dim, ccnt, duration.count(), ft.i1.first, ft.s1); + } + + } + + + return 0; +} + + + + diff --git a/real/top.h b/real/top.h new file mode 100644 index 0000000000000000000000000000000000000000..53575345d9a244be8dc3ebb9ef8b9e810ec5a99d --- /dev/null +++ b/real/top.h @@ -0,0 +1,32 @@ +#ifndef TOP_H +#define TOP_H + +#include <limits> + + +template <typename T> +struct topresult +{ + double s1, s2; + T i1, i2; + + topresult() {s1 = s2 = std::numeric_limits<double>::infinity();} + + void + test(double s, const T & i) + { + if (s < s1) { + s2 = s1; + i2 = i1; + s1 = s; + i1 = i; + } + else if (s < s2) { + s2 = s; + i2 = i; + } + } +}; + + +#endif