c++中的加权方差和加权标准差
Weighted Variance and Weighted Standard Deviation in C++
你好,我试图计算一系列整数或浮点数的加权方差和加权标准差。我找到了这些链接:
http://math.tutorvista.com/statistics/standard-deviation.html weighted-standard-deviation
http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf(警告pdf)
以下是目前为止我的模板函数。方差和标准偏差工作得很好,但对于我来说,我无法获得加权版本以匹配pdf底部的测试用例:template <class T>
inline float Mean( T samples[], int count )
{
float mean = 0.0f;
if( count >= 1 )
{
for( int i = 0; i < count; i++ )
mean += samples[i];
mean /= (float) count;
}
return mean;
}
template <class T>
inline float Variance( T samples[], int count )
{
float variance = 0.0f;
if( count > 1 )
{
float mean = 0.0f;
for( int i = 0; i < count; i++ )
mean += samples[i];
mean /= (float) count;
for( int i = 0; i < count; i++ )
{
float sum = (float) samples[i] - mean;
variance += sum*sum;
}
variance /= (float) count - 1.0f;
}
return variance;
}
template <class T>
inline float StdDev( T samples[], int count )
{
return sqrtf( Variance( samples, count ) );
}
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
float varianceWeighted = 0.0f;
if( count > 1 )
{
float sumWeights = 0.0f, meanWeighted = 0.0f;
int numNonzero = 0;
for( int i = 0; i < count; i++ )
{
meanWeighted += samples[i]*weights[i];
sumWeights += weights[i];
if( ((float) weights[i]) != 0.0f ) numNonzero++;
}
if( sumWeights != 0.0f && numNonzero > 1 )
{
meanWeighted /= sumWeights;
for( int i = 0; i < count; i++ )
{
float sum = samples[i] - meanWeighted;
varianceWeighted += weights[i]*sum*sum;
}
varianceWeighted *= ((float) numNonzero)/((float) count*(numNonzero - 1.0f)*sumWeights); // this should be right but isn't?!
}
}
return varianceWeighted;
}
template <class T>
inline float StdDevWeighted( T samples[], T weights[], int count )
{
return sqrtf( VarianceWeighted( samples, weights, count ) );
}
测试用例:int samples[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23 };
printf( "%.2fn", StdDev( samples, 9 ) );
int weights[] = { 1, 1, 0, 0, 4, 1, 2, 1, 0 };
printf( "%.2fn", StdDevWeighted( samples, weights, 9 ) );
结果:7.46
1.94
应:7.46
5.82
我认为问题是加权方差有几个不同的解释,我不知道哪一个是标准的。我发现了这个变化:
http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance Weighted_incremental_algorithm
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
float varianceWeighted = 0.0f;
if( count > 1 )
{
float sumWeights = 0.0f, meanWeighted = 0.0f, m2 = 0.0f;
for( int i = 0; i < count; i++ )
{
float temp = weights[i] + sumWeights,
delta = samples[i] - meanWeighted,
r = delta*weights[i]/temp;
meanWeighted += r;
m2 += sumWeights*delta*r; // Alternatively, m2 += weights[i] * delta * (samples[i]−meanWeighted)
sumWeights = temp;
}
varianceWeighted = (m2/sumWeights)*((float) count/(count - 1));
}
return varianceWeighted;
}
结果:7.46
5.64
我也试着看看boost和esutil,但它们没有多大帮助:
http://www.boost.org/doc/libs/1_48_0/boost/accumulators/statistics/weighted_variance.hpphttp://esutil.googlecode.com/svn-history/r269/trunk/esutil/stat/util.py
我不需要一个完整的统计库,更重要的是,我想了解实现。
是否有人可以发布函数来正确计算这些?
如果你的函数可以一次完成,那就加分了。
另外,有人知道加权方差是否与重复值的普通方差给出相同的结果吗?例如,样本[]={1,2,3,3}的方差是否与样本[]={1,2,3},权重[]={1,1,2}的加权方差相同?
更新:这是我设置的谷歌文档电子表格来探索这个问题。不幸的是,我的答案与NIST pdf相差甚远。我认为问题出在无偏见步骤上,但我不知道如何解决。
https://docs.google.com/spreadsheet/ccc?key=0ApzPh5nRin0ldGNNYjhCUTlWTks2TGJrZW4wQUcyZnc& usp =分享
结果是加权方差为3.77,这是我在c++代码中得到的加权标准差1.94的平方。
我试图在我的Mac OS X安装octave,这样我就可以用权重运行他们的var()函数,但是用brew安装它需要很长时间。我现在深深地爱上了剃牦牛毛。
float mean(uint16_t* x, uint16_t n) {
uint16_t sum_xi = 0;
int i;
for (i = 0; i < n; i++) {
sum_xi += x[i];
}
return (float) sum_xi / n;
}
/**
* http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weigmean.pdf
*/
float weighted_mean(uint16_t* x, uint16_t* w, uint16_t n) {
int sum_wixi = 0;
int sum_wi = 0;
int i;
for (i = 0; i < n; i++) {
sum_wixi += w[i] * x[i];
sum_wi += w[i];
}
return (float) sum_wixi / (float) sum_wi;
}
float variance(uint16_t* x, uint16_t n) {
float mean_x = mean(x, n);
float dist, dist2;
float sum_dist2 = 0;
int i;
for (i = 0; i < n; i++) {
dist = x[i] - mean_x;
dist2 = dist * dist;
sum_dist2 += dist2;
}
return sum_dist2 / (n - 1);
}
/**
* http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
*/
float weighted_variance(uint16_t* x, uint16_t* w, uint16_t n) {
float xw = weighted_mean(x, w, n);
float dist, dist2;
float sum_wi_times_dist2 = 0;
int sum_wi = 0;
int n_prime = 0;
int i;
for (i = 0; i < n; i++) {
dist = x[i] - xw;
dist2 = dist * dist;
sum_wi_times_dist2 += w[i] * dist2;
sum_wi += w[i];
if (w[i] > 0)
n_prime++;
}
if (n_prime > 1) {
return sum_wi_times_dist2 / ((float) ((n_prime - 1) * sum_wi) / n_prime);
} else {
return 0.0f;
}
}
/**
* http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm
*/
float weighted_incremental_variance(uint16_t* x, uint16_t* w, uint16_t n) {
uint16_t sumweight = 0;
float mean = 0;
float M2 = 0;
int n_prime = 0;
uint16_t temp;
float delta;
float R;
int i;
for (i = 0; i < n; i++) {
if (w[i] == 0)
continue;
temp = w[i] + sumweight;
delta = x[i] - mean;
R = delta * w[i] / temp;
mean += R;
M2 += sumweight * delta * R;
sumweight = temp;
n_prime++;
}
if (n_prime > 1) {
float variance_n = M2 / sumweight;
return variance_n * n_prime / (n_prime - 1);
} else {
return 0.0f;
}
}
void main(void) {
uint16_t n = 9;
uint16_t x[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23 };
uint16_t w[] = { 1, 1, 0, 0, 4, 1, 2, 1, 0 };
printf("%fn", weighted_variance(x, w, n)); /* outputs: 33.900002 */
printf("%fn", weighted_incremental_variance(x, w, n)); /* outputs: 33.900005 */
}
解决方案
您不小心在方差更新项的分母中添加了一个额外的项"count"。
当使用下面的更正时,我得到了您期望的
答案5.82
仅供参考,当你在做代码审查时,一种选择类似事情的方法是做"维度分析"。方程式的"单位"是错误的。你实际上是除以了一个(N)的平方项而它应该是一个(N)项
之前template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
...
varianceWeighted *= ((float) numNonzero)/((float) count*(numNonzero - 1.0f)*sumWeights); // this should be right but isn't?!
...
}
后删除"count"这一行应该被
取代template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
...
varianceWeighted *= ((float) numNonzero)/((float) (numNonzero - 1.0f)*sumWeights); // removed count term
...
}
这是一个更短的版本,有一个工作的演示:
#include <iostream>
#include <vector>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/weighted_variance.hpp>
#include <boost/accumulators/statistics/variance.hpp>
namespace ba = boost::accumulators;
int main() {
std::vector<double> numbers{2, 3, 5, 7, 11, 13, 17, 19, 23};
std::vector<double> weights{1, 1, 0, 0, 4, 1, 2, 1, 0 };
ba::accumulator_set<double, ba::stats<ba::tag::variance > > acc;
ba::accumulator_set<double, ba::stats<ba::tag::weighted_variance > , double > acc_weighted;
double n = numbers.size();
double N = n;
for(size_t i = 0 ; i<numbers.size() ; i++ ) {
acc ( numbers[i] );
acc_weighted( numbers[i] , ba::weight = weights[i] );
if(weights[i] == 0) {
n=n-1;
}
};
std::cout << "Sample Standard Deviation, s: " << std::sqrt(ba::variance(acc) *N/(N-1)) << std::endl;
std::cout << "Weighted Sample Standard Deviation, s: " << std::sqrt(ba::weighted_variance(acc_weighted)*n/(n-1)) << std::endl;
}
请注意,n
必须反映非零权重的样本数,因此额外的n=n-1;
行。
Sample Standard Deviation, s: 7.45729
Weighted Sample Standard Deviation, s: 5.82237
- OpenMP阵列性能较差
- 使用CMake检测支持的C++标准
- 如何理解C++标准N3337中的expr.const.cast子句8
- "throw expression code" 1e7 >返回 d 是什么?投掷标准::overflow_error( "too big" ) : d;意味 着?
- 编译标准库类型
- 找到两对数字,使它们的乘积的绝对差最小化
- 标准是否使用多余的大括号(例如 T{{{10}}})定义列表初始化?
- 为什么constexpr的性能比正常表达式差
- 编译器如何在使用SFINAE的函数和标准函数之间确定两者是否可行
- 在不传递参数数量且只有3个点的情况下,如何使用变差函数
- 标准库容器最简单、性能差的哈希类是什么?
- C++.加权标准::随机播放
- 如何使用向量查找平均值和标准差
- 计算运行标准差
- c++中的加权方差和加权标准差
- 如何在C++或Java中计算方差、中值和标准差
- 在c++中使用Boost计算样本向量的均值和标准差
- 10个箱的快速加权平均值和方差
- C++使用std::accumulate计算不正确的标准差
- 图像的均值、标准差、方差和偏度