Skip to content
Snippets Groups Projects
Commit 5135a0b1 authored by Nikolaj Tatti's avatar Nikolaj Tatti
Browse files

sparse decision trees

parents
No related branches found
No related tags found
No related merge requests found
SRCS = sparsesplit.cpp
OBJS = $(SRCS:.cpp=.o)
CC = g++
LDFLAGS= -lm -O3
CFLAGS= -g -O3 -MMD -Wall -Wextra
all : sparsesplit
sparsesplit: $(OBJS)
$(CC) $(LDFLAGS) -o $@ $(OBJS)
%.o : %.cpp
$(CC) $(CFLAGS) -c $*.cpp
clean:
-rm *.o sparsesplit
zip:
zip sparsesplit.zip *.cpp *.h Makefile
-include $(SRCS:%.cpp=%.d)
#ifndef DTSPARSE_H
#define DTSPARSE_H
#include <stdint.h>
#include <map>
#include <list>
#include <vector>
#include <limits>
#include <math.h>
#include "top.h"
namespace sparse {
typedef std::vector<uint32_t> uintvector;
typedef std::vector<uint32_t> labelvector;
struct attribute;
typedef std::multimap<double, attribute *> attrmap;
typedef topresult<uint32_t> sparsetop;
struct bucket {
double lowerf, upperf, f;
attrmap tree, lower, upper;
double
treekey(uint32_t c, uint32_t n)
{
double ent = 0;
if (c > 0 && c < n)
ent = -(c*log(c) + (n - c)*log(n - c) - n*log(n));
if (f > 0 && f < 1)
ent += c*log(f) + (n - c)*log(1 - f);
return ent;
}
double lowerkey(uint32_t c, uint32_t n) const {return n*lowerf - c;}
double upperkey(uint32_t c, uint32_t n) const {return n*upperf - c;}
};
typedef std::map<double, bucket> bucketmap;
typedef std::pair<bucketmap::iterator, attrmap::iterator> attrloc;
typedef std::list<attrloc> attrloclist;
struct attribute {
attrloc lower, upper;
attrloclist buckets;
uint32_t c, n;
uint32_t id;
attribute(uint32_t i) : c(0), n(0), id(i) {}
void
purge()
{
if (buckets.size() > 0 && buckets.front().first->second.f > 0) purge_lower();
if (buckets.size() > 0 && buckets.back().first->second.f < 1) purge_upper();
for (attrloclist::iterator it = buckets.begin(); it != buckets.end(); ++it) {
it->first->second.tree.erase(it->second);
}
buckets = attrloclist();
}
void
purge_lower()
{
lower.first->second.lower.erase(lower.second);
}
void
purge_upper()
{
upper.first->second.upper.erase(upper.second);
}
void
push_back(bucketmap::iterator it)
{
bucket & b = it->second;
double treekey = b.treekey(c, n);
attrmap::iterator loc = b.tree.insert(std::make_pair(treekey, this));
buckets.push_back(std::make_pair(it, loc));
}
void
push_front(bucketmap::iterator it)
{
bucket & b = it->second;
double treekey = b.treekey(c, n);
attrmap::iterator loc = b.tree.insert(std::make_pair(treekey, this));
buckets.push_front(std::make_pair(it, loc));
}
void
set_lower(bucketmap::iterator it)
{
bucket & b = it->second;
double key = b.lowerkey(c, n);
attrmap::iterator l = b.lower.insert(std::make_pair(key, this));
lower = std::make_pair(it, l);
}
void
set_upper(bucketmap::iterator it)
{
bucket & b = it->second;
double key = b.upperkey(c, n);
attrmap::iterator l = b.upper.insert(std::make_pair(key, this));
upper = std::make_pair(it, l);
}
};
typedef std::map<uint32_t, attribute> attrpool;
class leaf {
public:
leaf(double appr) : m_approx(appr), m_n(0), m_c(0) {}
uint32_t dim() const {return m_attributes.size();}
uint32_t bucketcnt() const {return m_buckets.size();}
uint32_t size() const {return m_n;}
void
process(const uintvector &attrs, bool label, bool update_buckets)
{
m_n++;
if (label) m_c++;
for (uint32_t i = 0; i < attrs.size(); i++) {
uint32_t j = attrs[i];
attrpool::iterator it = m_attributes.find(j);
if (it == m_attributes.end())
it = m_attributes.insert(std::make_pair(j, attribute(j))).first;
attribute &a = it->second;
a.n++;
if (label) a.c++;
if (update_buckets) update(a);
}
for (bucketmap::iterator it = m_buckets.begin(); it != m_buckets.end(); ++it) {
double j = it->first;
bucket &b = it->second;
if (j == -std::numeric_limits<double>::infinity()) {
while (b.upper.size() > 0) {
attribute &a = *b.upper.rbegin()->second;
if (a.c == m_c) break;
update(a);
}
}
else if (j == std::numeric_limits<double>::infinity()) {
while (b.lower.size() > 0) {
attribute &a = *b.lower.begin()->second;
if (a.n - a.c == m_n - m_c) break;
update(a);
}
}
else {
double threshold = b.upperkey(m_c, m_n);
while (b.upper.size() > 0) {
attrmap::reverse_iterator ait = b.upper.rbegin();
if (j < 0 && ait->first < threshold) break;
if (j >= 0 && ait->first <= threshold) break;
expand_upper(*ait->second);
}
threshold = b.lowerkey(m_c, m_n);
while (b.lower.size() > 0) {
attrmap::iterator ait = b.lower.begin();
if (j <= 0 && ait->first >= threshold) break;
if (j > 0 && ait->first > threshold) break;
expand_lower(*ait->second);
}
}
}
}
void
clean()
{
bucketmap::iterator it, itnext;
for (it = m_buckets.begin(); it != m_buckets.end(); it = itnext) {
itnext = it;
++itnext;
if (it->second.tree.empty()) {
m_buckets.erase(it);
}
}
}
sparsetop
truebest()
{
sparsetop t;
for (attrpool::iterator it = m_attributes.begin(); it != m_attributes.end(); ++it) {
t.test(entropy(it->second), it->first);
}
return t;
}
sparsetop
best()
{
sparsetop t;
for (bucketmap::iterator it = m_buckets.begin(); it != m_buckets.end(); ++it) {
attrmap::iterator ait = it->second.tree.begin();
if (ait == it->second.tree.end()) continue;
t.test(entropy(*ait->second), ait->second->id);
++ait;
if (ait == it->second.tree.end()) continue;
t.test(entropy(*ait->second), ait->second->id);
}
return t;
}
protected:
double
index(uint32_t c, uint32_t n) const
{
if (c == 0) return -std::numeric_limits<double>::infinity();
if (c == n) return std::numeric_limits<double>::infinity();
if (2*c <= n)
return -ceil(log((log(n) - log(c)) / log(2)) / log(1 + m_approx));
else
return ceil(log((log(n) - log(n - c)) / log(2)) / log(1 + m_approx));
}
double
freq(double j) const
{
if (j == -std::numeric_limits<double>::infinity()) return 0;
if (j == std::numeric_limits<double>::infinity()) return 1;
if (j < 0)
return pow(0.5, pow(1 + m_approx, -j));
else
return 1 - pow(0.5, pow(1 + m_approx, j));
}
double index(const attribute &a) const {return index(m_c - a.c, m_n - a.n);}
bucketmap::iterator
get(double j)
{
bucketmap::iterator it = m_buckets.find(j);
if (it == m_buckets.end()) {
it = m_buckets.insert(std::make_pair(j, bucket())).first;
it->second.f = freq(j);
it->second.lowerf = (j <= 0 ? freq(j) : freq(j - 1));
it->second.upperf = (j >= 0 ? freq(j) : freq(j + 1));
}
return it;
}
void
update(attribute &a)
{
a.purge();
double j = index(a);
bucketmap::iterator it = get(j);
a.push_back(it);
if (j > -std::numeric_limits<double>::infinity())
a.set_lower(it);
if (j < std::numeric_limits<double>::infinity())
a.set_upper(it);
}
void
expand_lower(attribute &a)
{
a.purge_lower();
double j = index(a);
bucketmap::iterator it = a.buckets.front().first;
bool first = true;
while (first || j < it->first - 0.5) { // 0.5 just for the numerical paranoia
bucketmap::iterator prev = it;
if (it == m_buckets.begin()) {
prev = get(it->first - 1);
}
else {
--prev;
if (prev->first < it->first - 1.5) prev = get(it->first - 1); // missing bucket, so generate new one
}
a.push_front(prev);
it = prev;
first = false;
}
a.set_lower(it);
}
void
expand_upper(attribute &a)
{
a.purge_upper();
assert(a.c <= a.n);
double j = index(a);
bucketmap::iterator it = a.buckets.back().first;
bool first = true;
while (first || j > it->first + 0.5) { // 0.5 just for the numerical paranoia
bucketmap::iterator next = it;
++next;
if (next == m_buckets.end() || next->first > it->first + 1.5)
next = get(it->first + 1); // missing bucket, so generate new one
a.push_back(next);
it = next;
first = false;
}
a.set_upper(it);
}
double
entropy(uint32_t c, uint32_t n)
const
{
if (c > 0 && c < n) {
return -(c*log(c) + (n - c)*log(n - c) - n*log(n));
}
return 0;
}
double
entropy(const attribute & a)
const
{
return entropy(a.c, a.n) + entropy(m_c - a.c, m_n - a.n);
}
double m_approx;
uint32_t m_n, m_c;
bucketmap m_buckets;
attrpool m_attributes;
};
};
#endif
#include "sparse.h"
#include <getopt.h>
#include <chrono>
typedef std::vector<sparse::uintvector> uintmatrix;
struct node {
node(double a): l(a), pos(0), neg(0), c(0), n(0) {}
sparse::leaf l;
uint32_t cond;
node *pos, *neg;
uint32_t c, n;
};
void
read_sparse(FILE *f, uintmatrix & d, sparse::labelvector & labels)
{
uint32_t dim;
fscanf(f, "%d", &dim);
d.resize(dim);
labels.resize(dim);
for (uint32_t i = 0; i < dim; i++) {
uint32_t cnt, label;
fscanf(f, "%d%d", &label, &cnt);
labels[i] = label;
d[i].resize(cnt);
for (uint32_t j = 0; j < cnt; j++) {
uint32_t ind;
fscanf(f, "%d", &ind);
d[i][j] = ind;
}
}
}
double
entropy(node *cur)
{
if (cur->pos == 0) {
if (cur->c > 0 && cur->c < cur->n) {
uint32_t c1 = cur->c;
uint32_t c2 = cur->n - cur->c;
uint32_t n = cur->n;
return -(c1*log(c1) + c2*log(c2) - n*log(n));
}
return 0;
}
else
return entropy(cur->neg) + entropy(cur->pos);
}
void
set_counters(const uintmatrix & d, const sparse::labelvector & labels, node *root)
{
for (uint32_t i = 0; i < labels.size(); i++) {
node *cur = root;
while (cur->pos) {
uint32_t j;
for (j = 0; j < d[i].size(); j++)
if (d[i][j] == cur->cond) break;
if (j < d[i].size())
cur = cur->pos;
else
cur = cur->neg;
}
cur->n++;
if (labels[i]) cur->c++;
}
}
node *
construct_tree(const uintmatrix & d, const sparse::labelvector & labels, double approx, bool tb, double prob, uint32_t maxn)
{
node *root = new node(approx);
for (uint32_t i = 0; i < labels.size(); i++) {
sparse::sparsetop st;
node *cur = root;
while (cur->pos) {
uint32_t j;
for (j = 0; j < d[i].size(); j++)
if (d[i][j] == cur->cond) break;
if (j < d[i].size())
cur = cur->pos;
else
cur = cur->neg;
}
cur->l.process(d[i], labels[i], !tb);
cur->l.clean();
if (tb)
st = cur->l.truebest();
else
st = cur->l.best();
uint32_t n = cur->l.size();
//double threshold = (6*(2*(1 + log(n)) + log(2) + log(n)) + 2*log(2))*sqrt(log(1 / prob) / (2 * n));
double threshold = log(2)*sqrt(log(1 / prob) / (2 * n));
if (st.s2 - st.s1 >= threshold * n || (maxn > 0 && n >= maxn)) {
printf("split %d\n", n);
cur->cond = st.i1;
cur->pos = new node(approx);
cur->neg = new node(approx);
}
}
return root;
}
int
main(int argc, char **argv)
{
static struct option longopts[] = {
{"approx", required_argument, NULL, 'a'},
{"out", required_argument, NULL, 'o'},
{"in", required_argument, NULL, 'i'},
{"help", no_argument, NULL, 'h'},
{ NULL, 0, NULL, 0 }
};
char *inname = NULL;
double approx = 0.1;
double prob = 0.0;
bool tb = false;
uint32_t report = 0;
uint32_t maxn = 0;
int ch;
while ((ch = getopt_long(argc, argv, "hi:a:tr:p:n:", longopts, NULL)) != -1) {
switch (ch) {
case 'h':
printf("Usage: %s [-a approx -p prob] -i <input file> -o <output file> [-h]\n", argv[0]);
printf(" -h print this help\n");
printf(" -i input file\n");
printf(" -a approximation guarantee (0.1 default)\n");
printf(" -t baseline algorithm\n");
printf(" -r report every nth data point\n");
printf(" -p probability for Hoeffding bound\n");
printf(" -n maximum size of a leaf\n");
return 0;
break;
case 'r':
report = atoi(optarg);
break;
case 't':
tb = true;
break;
case 'i':
inname = optarg;
break;
case 'p':
prob = atof(optarg);
break;
case 'a':
approx = atof(optarg);
break;
case 'n':
maxn = atoi(optarg);
break;
}
}
if (inname == NULL) {
printf("input file not set\n");
return 2;
}
uintmatrix data;
sparse::labelvector labels;
FILE *f = fopen(inname, "r");
read_sparse(f, data, labels);
fclose(f);
if (prob > 0) {
auto start = std::chrono::high_resolution_clock::now();
node *root = construct_tree(data, labels, approx, tb, prob, maxn);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
set_counters(data, labels, root);
printf("%f %.2f %.2f\n", approx, entropy(root) / labels.size(), double(duration.count()) / 1000000.0);
}
else {
sparse::leaf l(approx);
uint64_t nnz = 0;
auto start = std::chrono::high_resolution_clock::now();
sparse::sparsetop st;
for (uint32_t i = 0; i < labels.size(); i++) {
nnz += data[i].size();
l.process(data[i], labels[i], !tb);
l.clean();
if (tb)
st = l.truebest();
else
st = l.best();
if (report > 0 && (i + 1) % report == 0) {
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("%f %lu %d %llu %d %lld %d %lf\n", approx, i + 1, l.dim(), nnz, l.bucketcnt(), duration.count(), st.i1, st.s1);
}
}
if (report == 0) {
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("%f %lu %d %llu %d %lld %d %lf\n", approx, labels.size(), l.dim(), nnz, l.bucketcnt(), duration.count(), st.i1, st.s1);
}
}
return 0;
}
#ifndef TOP_H
#define TOP_H
#include <limits>
template <typename T>
struct topresult
{
double s1, s2;
T i1, i2;
topresult() {s1 = s2 = std::numeric_limits<double>::infinity();}
void
test(double s, const T & i)
{
if (s < s1) {
s2 = s1;
i2 = i1;
s1 = s;
i1 = i;
}
else if (s < s2 && i1 != i) { // additional test because we may have several tests with the same feature
s2 = s;
i2 = i;
}
}
};
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment