C:pthread的性能，比单线程低

C: performance of pthread, lower than single thread

本文关键字：单线程性能 pthread 更新时间：2023-10-16

我对我的代码的性能感到困惑：单线程处理时只需要 13 秒，但改用多线程（pthread）后却要消耗 80 秒。我不知道 vector 是否一次只能由一个线程访问；如果是这样，我可能必须使用结构体数组而不是 vector 来存储数据。有人可以帮忙吗？

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <ctime>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include <bangdb/database.h>
#include "SEQ.h"
// Size of the worker-thread pool that main() divides the queries among.
#define NUM_THREADS 16
using namespace std;

/* Work description handed to one worker thread; consumed by thr_func. */
struct _thread_data_t {
    std::vector<FDT> *Query;  /* key list the thread reads from (points at the caller's vector) */
    unsigned long start;      /* first index of this thread's slice */
    unsigned long end;        /* last index of this thread's slice (inclusive) */
    connection* conn;         /* connection the lookups are issued on */
    int thread;               /* worker number, printed in the log output */
};
typedef struct _thread_data_t thread_data_t;

void *thr_func(void *arg) {
thread_data_t *data = (thread_data_t *)arg;
std::vector<FDT> *Query = data->Query;
unsigned long start = data->start;
unsigned long end = data->end;
connection* conn = data->conn;
printf("thread %d started %lu -> %lun", data->thread, start, end);
for (unsigned long i=start;i<=end ;i++ )
{
FDT *fout = conn->get(&((*Query).at(i)));
if (fout == NULL)
{
//printf("%stNULLn", s);
}
else
{
printf("Thread:%dt%sn", data->thread, fout->data);
}
}
pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
if (argc<2)
{
printf("USAGE: ./seq <.txt>n");
printf("/home/rd/SCRIPTs/12X18610_L5_I052.R1.clean.code.seqn");
exit(-1);
}
printf("%sn", argv[1]);
vector<FDT> Query;
FILE* fpin;
if((fpin=fopen(argv[1],"r"))==NULL)  {
printf("Can't open Input file %sn", argv[1]);
return -1; 
}
char *key = (char *)malloc(36);
while (fscanf(fpin, "%s", key) != EOF)
{
SEQ * sequence = new SEQ(key);
FDT *fk = new FDT( (void*)sequence, sizeof(*sequence) );
Query.push_back(*fk);
}
unsigned long Querysize = (unsigned long)(Query.size());
std::cout << "myvector stores " << Querysize << " numbers.n";

//create database, table and connection
database* db = new database((char*)"berrydb");
//get a table, a new one or existing one, walog tells if log is on or off
table* tbl = db->gettable((char*)"hg19", JUSTOPEN);
if(tbl == NULL)
{
printf("ERROR:table NULL error");
exit(-1);
}
//get a new connection
connection* conn = tbl->getconnection();
if(conn == NULL)
{
printf("ERROR:connection NULL error");
exit(-1);
}
cerr<<"begin querying...n";

time_t begin, end;
double duration;
begin = clock();


unsigned long ThreadDealSize = Querysize/NUM_THREADS;
cerr<<"Querysize:"<<ThreadDealSize<<endl;

pthread_t thr[NUM_THREADS];
int rc;
thread_data_t thr_data[NUM_THREADS];
for (int i=0;i<NUM_THREADS ;i++ )
{
unsigned long ThreadDealStart = ThreadDealSize*i;
unsigned long ThreadDealEnd   = ThreadDealSize*(i+1) - 1;
if (i == (NUM_THREADS-1) )
{
ThreadDealEnd = Querysize-1;
}
thr_data[i].conn = conn;
thr_data[i].Query = &Query;
thr_data[i].start = ThreadDealStart;
thr_data[i].end = ThreadDealEnd;
thr_data[i].thread = i;
}

for (int i=0;i<NUM_THREADS ;i++ )
{
if (rc = pthread_create(&thr[i], NULL, thr_func, &thr_data[i]))
{
fprintf(stderr, "error: pthread_create, rc: %dn", rc);
return EXIT_FAILURE;
}
}

for (int i = 0; i < NUM_THREADS; ++i) {
pthread_join(thr[i], NULL);
}

cerr<<"donen"<<endl;
end = clock();
duration = double(end - begin) / CLOCKS_PER_SEC;
cerr << "runtime:   " << duration << "n" << endl;
db->closedatabase(OPTIMISTIC);
delete db;
printf("Donen");

return EXIT_SUCCESS;
}

与标准库中的所有数据结构一样，vector方法是可重入的，但不是线程安全的。这意味着多个线程可以独立访问不同的实例，但每个实例一次只能由一个线程访问，您必须确保这一点。但是由于每个线程都有单独的向量，因此这不是您的问题。

你的问题很可能出在 printf 上。printf 是线程安全的，这意味着您可以同时从任意数量的线程调用它，但代价是它在内部用互斥锁保护。

程序线程部分的大部分工作都是在 printf 中完成的。所以可能发生的情况是：所有线程都启动并很快到达 printf，除了第一个线程之外，其余线程都会在那里被阻塞。当 printf 完成并释放互斥锁时，系统会考虑调度正在等待该锁的线程。系统很可能会这样做，于是就发生一次相当慢的上下文切换，并且每次 printf 之后都会重复这一过程。

具体如何发生取决于所使用的实际锁定原语，而这又取决于您的操作系统和标准库版本。系统每次本应只唤醒下一个等待者，但许多实现实际上会唤醒所有等待者。因此，除了这些 printf 基本上以轮转方式依次执行、且每次都引起一次上下文切换之外，很可能还有相当多额外的虚假唤醒：线程被唤醒后发现锁仍被持有，只好重新进入睡眠状态。

所以从中得到的教训是，线程不会让事情自动变快。它们仅在以下情况下提供帮助：

线程将大部分时间花在阻塞式系统调用上。在网络服务器之类的程序中，线程先等待来自套接字的数据，然后等待从磁盘读取响应所需的数据，最后等待网络接受响应。在这种情况下，只要这些线程基本上相互独立，拥有许多线程就会有所帮助。
线程数量与 CPU 硬件线程数相同。目前通常的数字是 4（四核，或带超线程的双核）。更多的线程无法在物理上并行运行，因此它们不会带来收益，反而会产生一些开销。因此，16 个线程是多余的。

当所有线程都操作相同的对象时，线程永远不会有帮助，因为它们最终无论如何都会把大部分时间花在等待锁上。除了您自己加锁的对象之外，请记住，输入和输出文件句柄在内部也必须加锁。

内存分配也需要在线程之间进行内部同步，但现代分配器为各线程提供了单独的内存池，以避免大部分同步开销；如果默认分配器在线程很多时被证明太慢，可以改用一些专用的分配器。