如何在使用thrust::lower_bound()计算VBO索引时克服内存限制
How to overcome memory limitation during VBO indices computation using thrust::lower_bound()
我在GPU上使用Marching Cubes算法(基于CUDA)生成网格。网格非常精细,原始顶点列表存储在GPU上的一个VBO中,该VBO被映射为CUDA设备指针float *d_vertexData
。数据按顶点位置和法线交错排列,如下所示。
{v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, n1y, n1z, ...}
网格的大小通常在34MB(500K三角形)~1400MB(20M三角形)之间,存储在GPU上。
然后我使用thrust::sort()
, thrust::unique
来消除重复的顶点,并使用thrust::lower_bound()
来计算索引。在此步骤之后,网格尺寸减少了70%或更多。下面的代码演示了这一步。
// Example input: 6 vertices, each stored as 6 interleaved floats
// (position x, y, z followed by normal x, y, z), i.e. 36 floats in total.
float exampleVerts[36]=
{ 1, 2, 3, 0, 1, 0, 4, 5, 6, 0, 1, 0, 7, 8, 9, 0, 1, 0, 1, 2, 3, 0, 1, 0,
  4, 5, 6, 0, 1, 0, 10, 11, 12, 0, 1, 0};
unsigned int numVertices = 6;            // number of vertices, not floats
unsigned int data_size = numVertices * 6; //6 floats per vertex
cudaMalloc((void**)&d_vertexData, data_size*sizeof(float));
cudaMemcpy(d_vertexData, exampleVerts, data_size*sizeof(float), cudaMemcpyHostToDevice);
thrust::device_ptr<float> vertsPtr = thrust::device_pointer_cast(d_vertexData);
// Full copy of the unsorted vertex data; this is the allocation that doubles
// the memory footprint, because it is needed as the search input below.
thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);
// One index per input vertex.
thrust::device_vector<unsigned int> indices(numVertices);
auto zip_vert_first = zip(...); // using vertsPtr and strided_range
auto zip_vert_last = zip(...); // using vertsPtr and strided_range
// Sort the vertices in place, then drop adjacent duplicates.
thrust::sort(zip_vert_first, zip_vert_last);
auto new_vert_last = thrust::unique(zip_vert_first, zip_vert_last);
auto zip_vertcopy_first = zip(...); //using vertsCopy.data() and strided_range
auto zip_vertcopy_last = zip(...); //using vertsCopy.data() and strided_range
//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_vert_first, new_vert_last,
                    zip_vertcopy_first, zip_vertcopy_last,
                    indices.begin());
这种方法可行,但内存需求相当大。thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);
这一行需要额外分配与VBO同样大小的内存,用于存储thrust::lower_bound()
所需的顶点副本。
在我的应用程序中,网格通常非常大,原始顶点列表可超过1.5GB。这个方法有以下限制:
它需要相当于VBO大小117%的额外内存(100%用于顶点副本,17%用于索引)。
由于此限制,此方法不能在具有2GB或更低VRAM的GPU上运行。我正在使用具有4GB VRAM的GPU,即使这样,我也很容易在我的应用程序中达到此限制。
有没有其他方法可以在GPU上计算索引,而不需要如此巨大的内存?否则我唯一的选择就是回到CPU(主机)上计算,而我认为那样会非常慢。
如果你操作的是索引而不是顶点数据本身,你可以避免顶点的复制。
下面的例子(基于我对你上一个问题的回答和我在这里的回答)做了以下步骤:
- 同时对顶点和索引进行排序
- 查找重复顶点的起始索引
- 基于这些起始索引删除重复的顶点
- 计算新索引
最终索引存储在d_indices_2
中。
d_vertices: 1 2 3 4 5 6 7 8 9 4 5 6 7 8 9 0 1 2
d_indices: 0 1 2 3 4 5
d_vertices: 0 1 2 1 2 3 4 5 6 4 5 6 7 8 9 7 8 9
d_indices: 5 0 1 3 2 4
d_indices_2: 0 1 2 0 4 0
d_vertices: 0 1 2 1 2 3 4 5 6 7 8 9
d_indices_3: 0 1 2 2 3 3
d_indices_2: 1 2 3 2 3 0
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <iostream>
#include <thrust/tuple.h>
#include <thrust/execution_policy.h>
#include <thrust/scatter.h>
#include <thrust/unique.h>
#include <thrust/remove.h>
#include <stdint.h>
/// Bundles any number of iterators into a single thrust::zip_iterator.
///
/// @param its  iterators to combine; they are copied into a thrust::tuple.
/// @return     zip iterator over the tuple of the given iterators.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    thrust::tuple<Iterators...> bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}
template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
//template <difference_type stride>
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last)
: first(first), last(last) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
};
/// Primary template: deliberately empty, so only sequences that match the
/// partial specialization below expose a nested `type`.
template<typename Element, typename Sequence>
struct append_to_type_seq { };

/// Appends `Element` to the end of the template argument list of a variadic
/// class template instance, e.g. (int, Container<A, B>) -> Container<A, B, int>.
template<typename Element, typename... Existing, template<typename...> class Container>
struct append_to_type_seq<Element, Container<Existing...>>
{
    typedef Container<Existing..., Element> type;
};
template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
{
using type = typename
append_to_type_seq<
T,
typename repeat<T, N-1, TT>::type
>::type;
};
template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
using type = TT<>;
};
/// Converts a std::tuple type into the thrust::tuple type with the same
/// element types. Only declared here; defined for std::tuple below.
template<typename StdTuple> struct std_to_thrust_tuple;

/// std::tuple<Elements...> -> thrust::tuple<Elements...>.
template<typename... Elements>
struct std_to_thrust_tuple<std::tuple<Elements...>>
{
    typedef thrust::tuple<Elements...> type;
};
template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:
typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;
zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
{
assign<0>();
}
ZipIterator begin() const
{
return thrust::make_zip_iterator(begin_tuple);
}
ZipIterator end() const
{
return thrust::make_zip_iterator(end_tuple);
}
protected:
template <std::size_t index>
void assign(typename std::enable_if< (index < stride) >::type* = 0)
{
strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);
thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
thrust::get<index>(end_tuple) = strided_range_iterator.end();
assign<index+1>();
}
template <std::size_t index>
void assign(typename std::enable_if< (index == stride) >::type* = 0)
{
// end recursion
}
IteratorType first;
IteratorType last;
IteratorTuple begin_tuple;
IteratorTuple end_tuple;
};
// Prints a container together with the spelling of the expression naming it.
#define PRINTER(name) print(#name, (name))

/// Writes `name`, a colon and a tab, then every element of `v` followed by a
/// tab, then a newline, to std::cout.
///
/// @param name  label printed in front of the values.
/// @param v     container exposing begin()/end(); its elements are copied to
///              an output stream iterator via thrust::copy.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    // The separators are tab characters; the backslashes of the "\t" escapes
    // had been lost, which printed a literal 't' between the values.
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}
/// Functor that flags the first occurrence in a run of equal elements.
///
/// For position i it returns i when the element at `first + i` differs from
/// the element before it, and 0 when the two are equal. Positions that are
/// not greater than 0 are returned unchanged.
template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    /// @param first  iterator to the start of the sequence being inspected.
    my_scatter(IteratorType first) : first(first)
    {
    }

    __host__ __device__
    IndexType operator()(const IndexType& i)
    {
        const IndexType zero = static_cast<IndexType>(0);
        const IndexType one = static_cast<IndexType>(1);

        // Nothing precedes the first position, so there is nothing to compare.
        if (!(i > zero))
        {
            return i;
        }

        // Equal to the predecessor: report 0; otherwise report the position.
        return (*(first+i) == *(first+i-one)) ? zero : i;
    }

    IteratorType first;
};
/// Convenience factory that deduces the iterator type of a my_scatter functor.
///
/// @param first  iterator to the start of the sequence to inspect.
/// @return       my_scatter functor bound to `first`.
template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
    typedef my_scatter<IteratorType> functor_type;
    return functor_type(first);
}
/// Functor that collapses a value to its truth value, converted back to T
/// (the bool result of the cast is returned as a T).
template <typename T>
struct my_transformer : public thrust::unary_function<T,T>
{
    __host__ __device__
    T operator()(const T& x) const
    {
        const bool is_set = static_cast<bool>(x);
        return is_set;
    }
};
/// Demo: removes duplicate vertices from an interleaved vertex buffer and
/// computes, for every original vertex, its index in the deduplicated buffer,
/// without making a copy of the vertex data. The result ends up in
/// d_indices_2.
int main()
{
    using namespace thrust::placeholders;

    const int stride = 3;             // floats per vertex
    const int num = 6;                // number of vertices
    const int size = stride * num;    // total number of floats
    float values[size] = {1,2,3,
                          4,5,6,
                          7,8,9,
                          4,5,6,
                          7,8,9,
                          0,1,2
                         };
    typedef uint32_t Integer;

    thrust::host_vector<float> h_vertices (values, values+size);
    thrust::device_vector<float> d_vertices = h_vertices;
    float* dev_ptr = thrust::raw_pointer_cast(d_vertices.data());

    // View the flat float buffer as `num` tuples of `stride` floats.
    zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr+size);

    // d_indices starts as 0, 1, ..., num-1: the original position of each vertex.
    thrust::device_vector<Integer> d_indices(num);
    thrust::sequence(d_indices.begin(), d_indices.end());
    PRINTER(d_vertices);
    PRINTER(d_indices);

    // 1. sort
    // Vertices and their original positions are sorted together, so after the
    // sort d_indices[i] is the original position of sorted vertex i.
    auto zip_begin = zip(zipped.begin(),d_indices.begin());
    auto zip_end = zip(zipped.end(),d_indices.end());
    thrust::sort(thrust::device, zip_begin, zip_end);
    PRINTER(d_vertices);
    PRINTER(d_indices);

    thrust::device_vector<Integer> d_indices_2(num);

    // 2. find start indices of duplicate vertices
    // d_indices_2[i] is i when sorted vertex i differs from its predecessor
    // and 0 when it repeats it; entry 0 is always 0.
    auto my_scatter_op = make_my_scatter(zipped.begin());
    thrust::transform(thrust::make_counting_iterator(static_cast<Integer>(0)),
                      thrust::make_counting_iterator(static_cast<Integer>(num)),
                      d_indices_2.begin(),
                      my_scatter_op);
    PRINTER(d_indices_2);

    // 3. remove duplicate vertices
    /*
    // unique could be used, but we already know which vertices we want
    auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
    */
    // The first vertex is always kept, so both the data range and the stencil
    // start at position 1. The data range must end at zipped.end(): ending at
    // zipped.end()+1 would read one element past the end of both the vertex
    // buffer and d_indices_2.
    auto new_end = thrust::remove_if(thrust::device, zipped.begin()+1, zipped.end(), d_indices_2.begin()+1, !_1);
    int new_size = (new_end - zipped.begin());
    d_vertices.resize(stride*new_size);
    PRINTER(d_vertices);

    // Running count of the "new vertex" flags: the position of every sorted
    // vertex in the compacted buffer. Element 0 is not written by the scan and
    // keeps the value the vector was constructed with.
    thrust::device_vector<Integer> d_indices_3(num);
    auto transform_op = my_transformer<Integer>();
    auto t_b = thrust::make_transform_iterator(d_indices_2.begin()+1, transform_op);
    auto t_e = thrust::make_transform_iterator(d_indices_2.end(), transform_op);
    thrust::inclusive_scan(t_b, t_e, d_indices_3.begin()+1);
    PRINTER(d_indices_3);

    // 4. calculate final indices
    // d_indices_2[d_indices[i]] = d_indices_3[i]: every original vertex
    // position receives the index of its vertex in the compacted buffer.
    thrust::scatter(d_indices_3.begin(), d_indices_3.end(), d_indices.begin(), d_indices_2.begin());
    PRINTER(d_indices_2);

    return 0;
}
- 数组索引的值没有增加
- 芬威克树(BIT).找到具有给定累积频率的最小索引,单位为 O(logN)
- 查找最接近的大于当前数字的数字的索引
- 在C++中调整向量中的索引
- 重载元组索引运算符-C++
- 给定一个向量,如何找到该向量的所有子集和的原始索引
- 为std::string的某个索引赋值
- 并行用于C++17中数组索引范围内的循环
- 跟随整数索引列表的自定义类迭代器
- 如何在for循环中包含两个索引值的测试条件
- D3D11-将混合权重和索引传递到顶点着色器
- 将转换字符键入 int 以用作向量C++的索引
- 在 C++ 中访问数组负索引处的内存不会返回垃圾
- 如何为圆环创建索引
- 在子集化后将包含索引号的列表列表映射到标准索引序列
- 查找字符在两个索引之间出现的次数
- 可以在同一 VBO 中存储不同的顶点属性和索引
- 使用纹理,法线和索引列表从VBO绘制OpenGL对象的问题
- OpenGL索引VBO渲染
- 如何在使用thrust::lower_bound()计算VBO索引时克服内存限制