c++多线程读取文件

c++ reading a file multiple threads

本文关键字:文件 读取 多线程 c++      更新时间:2023-10-16

我有一个大文件,超过8900万行。我想读取一个文件,将其转换为哈希表,然后进行一些计算。

问题是使用istream读取文件并将其传递到哈希表的速度太慢。

是否有可能使用更多线程读取文件?使用线程库?

或者我应该把文件切分成若干小块,然后每一块都用一个线程来处理?

散列函数的计算不需要花费太多时间。

对于哈希冲突,我使用链表来处理。哈希表的大小是100万。

// Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include <iostream>
#include <fstream>
using namespace std;
int main()
{
hasho Hashy;
string f1, f2, f3, f4, f5, f6, f7;
bool is_first_line = true;
fstream file_input;
fstream  file_2_Collums;
cout << "Please give the name of the file that you want to run: n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only two columnn which is going to be used for searching based on that file)" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_input.open(infilename.c_str());
if (file_input)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}

cout << "Please give the name of the file that you want to run: n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only one column )" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_2_Collums.open(infilename.c_str());
if (file_2_Collums)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}
//creating output file

int * table;
table = new int[2];
int count_file_lines = 0;
int line_counter_inventors = 0;
if (file_input.is_open())
{
while (!file_input.eof())
{
if (is_first_line == true) {
getline(file_input, f1, 'n');
is_first_line = false;
}

getline(file_input, f1, 't');// patent id
getline(file_input, f2, 't');// patent id
getline(file_input, f3, 't');// patent id
getline(file_input, f3, 't');// patent id
getline(file_input, f6, 't');// patent id
getline(file_input, f3, 'n');//date

//cout << "adding these items " << f1 << 't' << f6 << endl;
Hashy.AddItem(f2, f6);
cout << count_file_lines << endl;
count_file_lines++;
//  cout << f2 << 't' << f6 << endl;
}
}
int lines_2 = 0;
if (file_2_Collums.is_open())
{
Hashy.openOutputFile();
while (!file_2_Collums.eof())
{
getline(file_2_Collums, f1, 't');//patent_id
getline(file_2_Collums, f4, 'n');//assignee_id
//cout << f1 << endl;

Hashy.FindDateId(f1, f4);
lines_2++;
}
}


system("pause");
return 0;}

Hash.cpp

#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include "hash.h"
#include <fstream>
using namespace std;
static ofstream output_file;
// Constructor: gives every bucket of HashTable its own heap-allocated head
// node. A head node whose string fields hold the sentinel text "empty" marks
// a bucket that has not received an entry yet.
hasho::hasho()
{
    int slot = 0;
    while (slot < tableSize) {
        item* placeholder = new item;
        placeholder->next = NULL;
        placeholder->date = "empty";
        placeholder->pattent_id = "empty";
        HashTable[slot] = placeholder;
        ++slot;
    }
}
void hasho::openOutputFile() {
cout << "Please give the name of the output file: n(The file should end with the format type (txt,csv etc.)) " << flush;
while (true)
{
string infilename;
getline(cin, infilename);
output_file.open(infilename.c_str(), fstream::out);
break;
}
}
int hasho::NumberOfItemsInIndex(int index) {
int count = 0;
if (HashTable[index]->date == "empty") {
return count;
}
else {
count++;
item* ptr = HashTable[index];
while (ptr->next != NULL) {
count++;
ptr = ptr->next;
}
}
return count;
}
// Dumps a summary of every bucket to stdout: the bucket index, the id and
// date stored in the bucket's head node, and the number of entries chained
// in that bucket (as reported by NumberOfItemsInIndex).
void hasho::PrintTable() {
    for (int i = 0; i < tableSize; i++) {
        const int number = NumberOfItemsInIndex(i);
        // The separator lines end with a real newline; the literal used to
        // end in a stray 'n' character (lost backslash of "\n").
        cout << "---------------------------------------\n";
        cout << "index= " << i << endl;
        cout << HashTable[i]->pattent_id << endl;
        cout << HashTable[i]->date << endl;
        cout << "# of items= " << number << endl;
        cout << "---------------------------------------\n";
    }
}

// Prints every entry chained in bucket `index` to stdout (date first, then
// id), or a one-line notice when the bucket's head node still carries the
// "empty" sentinel in its date field.
void hasho::PrintItemsInIndex(int index) {
    item* ptr = HashTable[index];
    if (ptr->date == "empty") {
        cout << "index  = " << index << " is empty." << endl;
        return;
    }

    // The heading ends with a real newline; the literal used to end in a
    // stray 'n' character (lost backslash of "\n").
    cout << "index = " << index << " contains the following items\n";
    while (ptr != NULL) {
        cout << "-----------" << endl;
        cout << ptr->date << endl;
        cout << ptr->pattent_id << endl;
        cout << "-----------" << endl;
        ptr = ptr->next;
    }
}

// Stores the pair (pattend_id, date) in the bucket selected by Hash(pattend_id).
// If the bucket's head node still holds the "empty" sentinel id, the pair is
// written into the head node itself; otherwise a new node is appended at the
// end of the bucket's chain, so entries keep their insertion order.
void hasho::AddItem(string pattend_id, string date)
{
    item* const head = HashTable[Hash(pattend_id)];

    if (head->pattent_id == "empty")
    {
        head->pattent_id = pattend_id;
        head->date = date;
        return;
    }

    // Build the new node first, then walk to the current tail and link it in.
    item* fresh = new item;
    fresh->pattent_id = pattend_id;
    fresh->date = date;
    fresh->next = NULL;

    item* tail = head;
    while (tail->next != NULL) {
        tail = tail->next;
    }
    tail->next = fresh;
}
// Looks up `pattend_id` in its bucket and, for every chained entry with that
// exact id, writes a record to the file-level `output_file` stream:
//   pattend_id <TAB> assignee_id1 <newline> stored-date <TAB>
// Afterwards prints "found 1"/"found 0" and "not found 0"/"not found 1" for
// this single lookup to stdout (the counters are per call, not cumulative).
//
// NOTE(review): the stored date is written after the line break, so it ends
// up at the start of the following output line. That ordering is kept
// unchanged here - confirm whether the date was meant to be on the same line.
void hasho::FindDateId(string pattend_id, string assignee_id1) {
    int found = 0;
    int nfound = 0;
    const int index = Hash(pattend_id);

    for (item* ptr = HashTable[index]; ptr != NULL; ptr = ptr->next) {
        if (ptr->pattent_id == pattend_id) {
            // Real tab separators (the literals had lost their backslash and
            // wrote the letter 't'), and "\n" instead of endl so the output
            // file is not flushed to disk once per record.
            output_file << pattend_id << "\t";
            output_file << assignee_id1 << "\n";
            output_file << ptr->date << "\t";
            found = 1;
        }
    }

    if (found == 0) {
        nfound++;
    }
    cout << "found " << found << endl;
    cout << "not found " << nfound << endl;
    cout << endl;
}
// Maps `key` to a bucket index in [0, tableSize).
// Each character code is multiplied by its 1-based position in the key, the
// products are summed in unsigned arithmetic (wrapping on overflow), and the
// sum is reduced modulo tableSize.
int hasho::Hash(string key)
{
    unsigned int sum = 0;
    unsigned int pos = 0;
    while (pos < key.length()) {
        const unsigned int weight = pos + 1;
        sum += static_cast<int>(key[pos]) * weight;
        ++pos;
    }
    const unsigned int bucket = sum % tableSize;
    return bucket;
}

Hash.h

#pragma once
#include "pch.h"
#include <iostream>
#include <string>
//#include "hash.cpp"
using namespace std;
#pragma comment(linker, "/STACK:7000000")
#pragma comment(linker, "/HEAP:7000000")
#ifndef  HASH_H
#define HASH_H

// Fixed-size chained hash table mapping a string id to a string date.
//
// Every bucket of HashTable points at a heap-allocated head node; further
// entries that hash to the same bucket are chained through `next`.
// The object owns all of these nodes and releases them in its destructor.
class hasho {
private:
    // Number of buckets.
    static const int tableSize = 300003;

    // One stored entry, singly linked to the next entry of the same bucket.
    struct item {
        string pattent_id;
        string date;
        item* next;
    };

    // Head node of every bucket; filled in by the constructor.
    item* HashTable[tableSize];

    // The table owns raw pointers, so a member-wise copy would lead to the
    // same nodes being deleted twice. Copying is therefore disabled:
    // declared private and intentionally left undefined.
    hasho(const hasho&);
    hasho& operator=(const hasho&);

public:
    // Allocates the head node of every bucket.
    hasho();

    // Frees every node of every bucket chain (previously all nodes leaked).
    ~hasho() {
        for (int i = 0; i < tableSize; i++) {
            item* node = HashTable[i];
            while (node) {
                item* following = node->next;
                delete node;
                node = following;
            }
        }
    }

    // Returns the bucket index for `key`.
    int Hash(string key);
    // Stores the pair (pattend_id, date).
    void AddItem(string pattend_id, string date);
    // Returns the number of entries stored in bucket `index`.
    int NumberOfItemsInIndex(int index);
    // Prints a summary of every bucket to stdout.
    void PrintTable();
    // Prints every entry of bucket `index` to stdout.
    void PrintItemsInIndex(int index);
    // Writes the stored date(s) for `pattent_id` to the output file.
    void FindDateId(string pattent_id, string assgnee_id);
    // Asks for the output file name and opens it.
    void openOutputFile();
};

#endif // ! HASH_H

我有一个超过8900万行的大文件

如果你是想用多个线程来处理它,你可能不应该这样做。你应该说明这个巨大的文件包含什么(什么样的数据:基因组学、时间序列……)及其大小(以GB为单位)。你是只处理这个文件一次,还是要多次处理?处理它需要多少时间(用 time(1) 测量)?用 wc(1) 统计行数又需要多少时间?

一种可能性是将该文件拆分为几个由完整的行组成的较小文件(例如,使用 split(1)),然后把这些较小的文件交给你的程序处理。我不知道这是否会对你有所帮助(可能不会,除非你要多次运行程序来读取这些文件)。

另一种可能是对文件进行两遍处理。第一遍对行进行计数,并且可以记住其中一些行(例如,每隔1024行)的起始偏移量。然后,你可以在第二遍中并行处理文件(利用之前记住的偏移量)。

顺便说一句,如果你的文件大到无法留在页面缓存中,那么你的问题就是受IO限制的(I/O bound,瓶颈在物理磁盘硬件上),你不会通过并行化(即使将它拆分成更小的文件)获得任何速度提升。

一种可能性可能是读取并解析一次(缓慢地)您的巨大文件,然后用它的数据填充一些数据库(可能是sqlite数据库)。然后,您可能会(如果您多次处理该数据)利用访问该数据库(而不是该文件)的优势。

关于哈希表,请考虑使用标准C++容器(例如std::unordered_map)。

PS。我们不知道那个巨大的文件是什么,它包含什么,以及你如何处理它