Skip to content

Commit 942afa3

Browse files
author
atksh
committed
double to float. more parallel for
1 parent 69b9287 commit 942afa3

8 files changed

Lines changed: 128 additions & 99 deletions

File tree

cpp/main.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ PYBIND11_MODULE(PRTree, m) {
1717
)pbdoc";
1818

1919
py::class_<PRTree<T, B>>(m, "PRTree")
20-
.def(py::init<py::array_t<T>, py::array_t<double>>(), R"pbdoc(
20+
.def(py::init<py::array_t<T>, py::array_t<float>>(), R"pbdoc(
2121
Construct PRTree with init.
2222
)pbdoc")
2323
.def(py::init<>(), R"pbdoc(

cpp/parallel.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#pragma once
2+
#include <vector>
3+
#include <mutex>
4+
#include <thread>
5+
#include <algorithm>
6+
#include <iostream>
7+
8+
9+
template<typename U>
10+
using vec = std::vector<U>;
11+
12+
template<typename F, typename Iter, typename T>
13+
void parallel_for_each(const Iter first, const Iter last, T& result, const F& func){
14+
auto f = std::ref(func);
15+
const int nthreads = std::max(1, (int) std::thread::hardware_concurrency());
16+
const size_t total = std::distance(first, last);
17+
vec<T> rr(nthreads);
18+
{
19+
vec<std::thread> threads;
20+
vec<Iter> iters;
21+
auto step = total / nthreads;
22+
auto remaining = total % nthreads;
23+
auto n = first;
24+
iters.emplace_back(first);
25+
for (int i = 0; i < nthreads - 1; ++i){
26+
std::advance(n, i<remaining?step+1:step);
27+
iters.emplace_back(n);
28+
}
29+
iters.emplace_back(last);
30+
31+
result.reserve(total);
32+
for (auto&& r : rr){
33+
r.reserve(total/nthreads+1);
34+
}
35+
for (int t = 0; t < nthreads; t++){
36+
threads.emplace_back(std::thread([&,t]{std::for_each(iters[t], iters[t+1], [&](auto& x){f(x, rr[t]);});}));
37+
}
38+
std::for_each(threads.begin(), threads.end(), [&](std::thread& x){x.join();});
39+
}
40+
for (int t = 0; t < nthreads; t++){
41+
result.insert(result.end(),
42+
std::make_move_iterator(rr[t].begin()),
43+
std::make_move_iterator(rr[t].end()));
44+
}
45+
}
46+
47+
48+
template<typename F, typename Iter>
49+
void parallel_for_each(const Iter first, const Iter last, const F& func){
50+
auto f = std::ref(func);
51+
const int nthreads = std::max(1, (int) std::thread::hardware_concurrency());
52+
const size_t total = std::distance(first, last);
53+
{
54+
vec<std::thread> threads;
55+
vec<Iter> iters;
56+
auto step = total / nthreads;
57+
auto remaining = total % nthreads;
58+
auto n = first;
59+
iters.emplace_back(first);
60+
for (int i = 0; i < nthreads - 1; ++i){
61+
std::advance(n, i<remaining?step+1:step);
62+
iters.emplace_back(n);
63+
}
64+
iters.emplace_back(last);
65+
for (int t = 0; t < nthreads; t++){
66+
threads.emplace_back(std::thread([&,t]{std::for_each(iters[t], iters[t+1], [&](auto& x){f(x);});}));
67+
}
68+
std::for_each(threads.begin(), threads.end(), [&](std::thread& x){x.join();});
69+
}
70+
}

cpp/prtree.h

Lines changed: 43 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <mutex>
1717
#include <thread>
1818
#include <future>
19+
#include <functional>
1920
#include <cstdlib>
2021
#include <pybind11/pybind11.h>
2122
#include <pybind11/numpy.h>
@@ -29,6 +30,7 @@
2930
#include <cereal/types/vector.hpp>
3031
#include <cereal/types/unordered_map.hpp>
3132

33+
#include "parallel.h"
3234

3335

3436
namespace py = pybind11;
@@ -69,12 +71,12 @@ class Uncopyable {
6971

7072
class BB{
7173
public:
72-
double xmin, xmax, ymin, ymax;
74+
float xmin, xmax, ymin, ymax;
7375
BB() {
7476
clear();
7577
}
7678

77-
BB(const double& _xmin, const double& _xmax, const double& _ymin, const double& _ymax){
79+
BB(const float& _xmin, const float& _xmax, const float& _ymin, const float& _ymax){
7880
if (unlikely(_xmin > _xmax || _ymin > _ymax)){
7981
throw std::runtime_error("Impossible rectange was given. xmin < xmax and ymin < ymax must be satisfied.");
8082
}
@@ -103,7 +105,7 @@ class BB{
103105
return *this;
104106
}
105107

106-
inline void expand(const double& dx, const double& dy){
108+
inline void expand(const float& dx, const float& dy){
107109
xmin -= dx;
108110
xmax += dx;
109111
ymin -= dy;
@@ -116,7 +118,7 @@ class BB{
116118
return unlikely(c1 && c2);
117119
}
118120

119-
inline double operator [](const int& i)const{
121+
inline float operator [](const int& i)const{
120122
if (i == 0){
121123
return xmin;
122124
} else if (i == 1){
@@ -388,25 +390,15 @@ class PseudoPRTree : Uncopyable{
388390
}
389391

390392
std::pair<DataType<int>*, DataType<int>*> as_X(void* placement, const int hint){
391-
const size_t nthreads = std::max(1, (int) std::thread::hardware_concurrency());
392393
DataType<int> *b, *e;
393394
auto children = get_all_leaves(hint);
394395
int total = children.size();
395396
b = (DataType<int>*)placement;
396397
e = b + total;
397-
{
398-
vec<std::thread> threads(nthreads);
399-
for (int t = 0; t < nthreads; t++){
400-
threads[t] = std::thread(std::bind(
401-
[&](const int bi, const int ei, const int t){
402-
for (int i = bi; i < ei; i++){
403-
new(b + i) DataType<int>{i, children[i]->mbb};
404-
}
405-
}, t * total/nthreads, unlikely((t+1)==nthreads)?total:(t+1)*total/nthreads, t)
406-
);
407-
}
408-
std::for_each(threads.begin(), threads.end(), [&](std::thread& x){x.join();});
409-
}
398+
parallel_for_each(b, e, [&](auto& p){
399+
int i = &p - b;
400+
new(b + i) DataType<int>{i, children[i]->mbb};
401+
});
410402
return {b, e};
411403
}
412404
};
@@ -488,8 +480,7 @@ class PRTree : Uncopyable{
488480
}
489481
}
490482

491-
492-
PRTree(const py::array_t<T>& idx, const py::array_t<double>& x){
483+
PRTree(const py::array_t<T>& idx, const py::array_t<float>& x){
493484
const auto &buff_info_idx = idx.request();
494485
const auto &shape_idx = buff_info_idx.shape;
495486
const auto &buff_info_x = x.request();
@@ -505,22 +496,30 @@ class PRTree : Uncopyable{
505496
DataType<T> *b, *e;
506497
void *placement = std::malloc(std::max(sizeof(DataType<T>), sizeof(DataType<int>)) * length);
507498
b = (DataType<T>*)placement;
508-
e = b;
509-
for (size_t i = 0; i < length; i++){
510-
auto bb = BB(*x.data(i, 0), *x.data(i, 1), *x.data(i, 2), *x.data(i, 3));
499+
e = b + length;
511500

512-
new(e) DataType<T>{*idx.data(i), bb};
513-
e++;
501+
parallel_for_each(b, e,
502+
[&](auto& it){
503+
int i = &it - b;
504+
auto bb = BB(*x.data(i, 0), *x.data(i, 1), *x.data(i, 2), *x.data(i, 3));
505+
new(b + i) DataType<T>{*idx.data(i), bb};
506+
});
514507

508+
auto build_umap = [&]{for (size_t i = 0; i < length; i++){
509+
auto bb = BB(*x.data(i, 0), *x.data(i, 1), *x.data(i, 2), *x.data(i, 3));
515510
umap.emplace_hint(umap.end(), *idx.data(i), std::move(bb));
516-
}
511+
}};
512+
517513
//ProfilerStart("construct.prof");
518-
build(b, e, placement);
514+
auto t1 = std::thread(build_umap);
515+
auto t2 = std::thread([&]{build(b, e, placement);});
516+
t1.join();
517+
t2.join();
519518
//ProfilerStop();
520519
std::free(placement);
521520
}
522521

523-
void insert(const T& idx, const py::array_t<double>& x){
522+
void insert(const T& idx, const py::array_t<float>& x){
524523
vec<Leaf<T, B>*> cands;
525524
std::queue<PRTreeNode<T, B>*, std::deque<PRTreeNode<T, B>*>> que;
526525
BB bb;
@@ -538,9 +537,9 @@ class PRTree : Uncopyable{
538537
}
539538

540539
bb = BB(*x.data(0), *x.data(1), *x.data(2), *x.data(3));
541-
double dx = bb.xmax - bb.xmin + 0.000000001;
542-
double dy = bb.ymax - bb.ymin + 0.000000001;
543-
double c = 0.0;
540+
float dx = bb.xmax - bb.xmin + 0.000000001;
541+
float dy = bb.ymax - bb.ymin + 0.000000001;
542+
float c = 0.0;
544543
std::stack<PRTreeNode<T, B>*> sta;
545544
while (likely(cands.size() == 0)){
546545
while (likely(!sta.empty())){
@@ -600,16 +599,17 @@ class PRTree : Uncopyable{
600599

601600
template<class iterator>
602601
void build(iterator b, iterator e, void *placement){
603-
const int nthreads = std::max(1, (int) std::thread::hardware_concurrency());
604602
vec<Leaf<int, B>*> leaves;
605603
vec<std::unique_ptr<PRTreeNode<T, B>>> tmp_nodes, prev_nodes;
606604
std::unique_ptr<PRTreeNode<T, B>> p, q, r;
607605

608606
auto first_tree = PseudoPRTree<T, B>(b, e);
609-
for (auto& leaf : first_tree.get_all_leaves(e - b)){
610-
p = std::make_unique<PRTreeNode<T, B>>(leaf);
611-
prev_nodes.emplace_back(std::move(p));
612-
}
607+
auto first_leaves = first_tree.get_all_leaves(e - b);
608+
parallel_for_each(first_leaves.begin(), first_leaves.end(), prev_nodes,
609+
[&](auto& leaf, auto& o){
610+
auto pp = std::make_unique<PRTreeNode<T, B>>(leaf);
611+
o.emplace_back(std::move(pp));
612+
});
613613

614614
auto [bb, ee] = first_tree.as_X(placement, e - b);
615615
while (likely(prev_nodes.size() > 1)){
@@ -619,18 +619,9 @@ class PRTree : Uncopyable{
619619
tmp_nodes.clear();
620620
tmp_nodes.reserve(leaves_size);
621621

622-
vec<decltype(tmp_nodes)> out_privates(nthreads);
623-
{
624-
vec<std::thread> threads(nthreads);
625-
for (auto& o : out_privates){
626-
o.reserve(leaves_size/nthreads*2);
627-
}
628-
for (int t = 0; t < nthreads; t++){
629-
threads[t] = std::thread(std::bind(
630-
[&](const int bi, const int ei, const int t){
631-
for (size_t k = bi; k < ei; k++){
622+
parallel_for_each(leaves.begin(), leaves.end(), tmp_nodes,
623+
[&](auto& leaf, auto& o){
632624
int idx, jdx;
633-
auto& leaf = leaves[k];
634625
int len = leaf->data.size();
635626
auto pp = std::make_unique<PRTreeNode<T, B>>(leaf->mbb);
636627
if (likely(!leaf->data.empty())){
@@ -642,21 +633,11 @@ class PRTree : Uncopyable{
642633
idx = leaf->data[0].first;
643634
pp->head = std::move(prev_nodes[idx]);
644635
if (!pp->head){throw std::runtime_error("ppp");}
645-
out_privates[t].emplace_back(std::move(pp));
636+
o.emplace_back(std::move(pp));
646637
} else {
647638
throw std::runtime_error("what????");
648639
}
649-
}
650-
}, t*leaves_size/nthreads, unlikely((t+1)==nthreads)?leaves_size:(t+1)*leaves_size/nthreads, t));
651-
}
652-
std::for_each(threads.begin(), threads.end(), [&](std::thread& x){x.join();});
653-
654-
for (int t = 0; t < nthreads; t++){
655-
tmp_nodes.insert(tmp_nodes.end(),
656-
std::make_move_iterator(out_privates[t].begin()),
657-
std::make_move_iterator(out_privates[t].end()));
658-
}
659-
}
640+
});
660641

661642
{
662643
auto tmp = tree.as_X(placement, ee - bb);
@@ -687,7 +668,7 @@ class PRTree : Uncopyable{
687668
}
688669

689670

690-
auto find_all(const py::array_t<double>& x){
671+
auto find_all(const py::array_t<float>& x){
691672
const auto &buff_info_x = x.request();
692673
const auto &ndim= buff_info_x.ndim;
693674
const auto &shape_x = buff_info_x.shape;
@@ -711,35 +692,13 @@ class PRTree : Uncopyable{
711692
}
712693
}
713694
size_t length = X.size();
714-
const int nthreads = std::max(1, (int) std::thread::hardware_concurrency());
715-
vec<vec<vec<T>>> out_privates(nthreads);
716-
{
717-
vec<std::thread> threads(nthreads);
718-
for (auto& o : out_privates){
719-
o.reserve(length/nthreads+1);
720-
}
721-
for (int t = 0; t < nthreads; t++){
722-
threads[t] = std::thread(std::bind(
723-
[&](const int bi, const int ei, const int t){
724-
for (int i = bi; i < ei; i++){
725-
out_privates[t].emplace_back(std::move(find(X[i])));
726-
}
727-
}, t*length/nthreads, unlikely((t+1)==nthreads)?length:(t+1)*length/nthreads, t));
728-
}
729-
std::for_each(threads.begin(), threads.end(), [&](std::thread& x){x.join();});
730-
}
731695
vec<vec<T>> out;
732696
out.reserve(length);
733-
for (int t = 0; t < nthreads; t++){
734-
out.insert(out.end(),
735-
std::make_move_iterator(out_privates[t].begin()),
736-
std::make_move_iterator(out_privates[t].end()));
737-
vec<vec<T>>().swap(out_privates[t]);
738-
}
697+
parallel_for_each(X.begin(), X.end(), out, [&](const BB& x, auto& o){o.emplace_back(find(x));});
739698
return out;
740699
}
741700

742-
vec<T> find_one(const py::array_t<double>& x){
701+
vec<T> find_one(const py::array_t<float>& x){
743702
const auto &buff_info_x = x.request();
744703
const auto &ndim= buff_info_x.ndim;
745704
const auto &shape_x = buff_info_x.shape;

docs/experiment.ipynb

Lines changed: 13 additions & 13 deletions
Large diffs are not rendered by default.

docs/images/fig1.png

352 Bytes
Loading

docs/images/fig2.png

352 Bytes
Loading

docs/images/fig3.png

1.06 KB
Loading

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def build_extension(self, ext):
8080

8181
setup(
8282
name='python_prtree',
83-
version='0.3.2',
83+
version='0.3.3',
8484
license='MIT',
8585
description='Python implementation of Priority R-Tree',
8686
author='atksh',

0 commit comments

Comments
 (0)