C++读取大数据，解析，然后写入数据

C++ Read large data, parse, then write data

本文关键字：数据然后解析读取 C++ 更新时间：2023-10-16

我正在尝试读取一个大型数据集，按照我需要的方式对其进行格式化，然后将其写入另一个文件。我试图使用C++而不是SAS或STATA来获得速度优势。数据文件通常在10GB左右。我当前的代码需要一个多小时才能运行（然后我就把它杀死了，因为我确信我的代码效率很低）。

有没有更有效的方法可以做到这一点？也许可以将文件读入内存，然后使用switch语句进行分析？（我有32gb ram linux 64位）。有没有可能在循环中先读后写会减慢速度，因为它一直在读，然后写？我试着从一个驱动器读取它，然后写到另一个驱动器，试图加快速度。

switch/case 语句会减慢速度吗？

我现在使用getline读取数据，使用switch语句正确解析数据，然后将其写入outfile。并重复3亿行。switch语句中还有大约10个案例，但为了简洁起见，我没有复制。

代码可能非常丑陋，全部都写在主函数中，但我想先让它能正常运行，再考虑代码的美观问题。

我试过使用read（），但没有成功。如果我需要澄清什么，请告诉我。

谢谢你的帮助！

 #include <iostream>
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <stdio.h>
 //#include <cstring>
 //#include <boost/algorithm/string.hpp>
 #include <vector>
  using namespace std;
 //using namespace boost;

 struct dataline
{
char type[0];
double second;
short mill;
char event[1];
char ticker[6];
char marketCategory[1];
char financialStatus[1];
int roundLotSize;
short roundLotOnly;
char tradingState[1];
char reserved[1];
char reason[4];
char mpid[4];
char primaryMarketMaker[1];
char primaryMarketMode[1];
char marketParticipantState[1];
unsigned long orderNumber;
char buySell[0];
double shares;
float price;
int executedShares;
double matchNumber;
char printable[1];
double executionPrice;
int canceledShares;
double sharesBig;
double crossPrice;
char crossType[0];
double pairedShares;
double imbalanceShares;
char imbalanceDirection[1];
double fairPrice;
double nearPrice;
double currentReferencePrice;
char priceVariationIndicator[1];
};
  int main () 
{
string a; 
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;
short time;     //counter to keep second filled
short smalltime;    //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M; 
//vector<> fout;
string line;
ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");
dataline oneline;
if (myfile.is_open())
    {
    while ( myfile.good() )
        {
        getline (myfile,line);
//      cout << line<<endl;;
        a=line.substr(0,1);
        stringstream ss(a);
        char type;
        ss>>type;

        switch (type)
            { 
            case 'T':
                {
                if (type == 'T')
                    {
                    times=line.substr(1,5);
                    stringstream s(times);
                    s>>time;
                    //oneline.second=time;
                    //oneline.second;
                    //cout<<time<<endl;
                    }
                else
                    {
                    time=time;
                    }
                break;
                }
            case 'M':
                {
                if (type == 'M')
                    {
                    smalltimes=line.substr(1,3);
                    stringstream ss(smalltimes);
                    ss>>smalltime;      //oneline.mill;
                //  cout<<smalltime<<endl;                            //smalltime=oneline.mill;
                    }
                else
                    {
                    smalltime=smalltime;
                    }
                break;
                }

            case 'R':
                {
                oneline.second=time;
                oneline.mill=smalltime;
                a=line.substr(0,1);
                stringstream ss(a);
                ss>>oneline.type;
                b=line.substr(1,6);
                stringstream sss(b);
                sss>>oneline.ticker;
                c=line.substr(7,1);
                stringstream ssss(c);
                ssss>>oneline.marketCategory;
                d=line.substr(8,1);
                stringstream sssss(d);
                sssss>>oneline.financialStatus;
                e=line.substr(9,6);
                stringstream ssssss(e);
                ssssss>>oneline.roundLotSize;
                f=line.substr(15,1);
                stringstream sssssss(f);
                sssssss>>oneline.roundLotOnly;
                *oneline.tradingState=0;
                *oneline.reserved=0;
                *oneline.reason=0;
                *oneline.mpid=0;
                *oneline.primaryMarketMaker=0;
                *oneline.primaryMarketMode=0;
                *oneline.marketParticipantState=0;
                oneline.orderNumber=0;
                *oneline.buySell=0;
                oneline.shares=0;
                oneline.price=0;
                oneline.executedShares=0;
                oneline.matchNumber=0;
                *oneline.printable=0;
                oneline.executionPrice=0;
                oneline.canceledShares=0;
                oneline.sharesBig=0;
                oneline.crossPrice=0;
                *oneline.crossType=0;
                oneline.pairedShares=0;
                oneline.imbalanceShares=0;
                *oneline.imbalanceDirection=0;
                oneline.fairPrice=0;
                oneline.nearPrice=0;
                oneline.currentReferencePrice=0;
                *oneline.priceVariationIndicator=0;
                break;
                }//End Case 
            }//End Switch
            }//end While
    myfile.close();
     }//End If
else cout << "Unable to open file"; 
cout<<"Junk"<<endl;
return 0;
}

UPDATE所以我一直在尝试使用内存映射，但现在我遇到了分段错误。我一直在尝试以不同的例子拼凑出对我有用的东西。为什么我会出现分段错误？我已经获取了代码的第一部分，它看起来像这样：

int main (int argc, char** path) 
 {
 long i;
 int fd;
 char *map;
 char *FILEPATH = path;
 unsigned long FILESIZE;
 FILE* fp = fopen(FILEPATH, "/home/brian/Desktop/S050508-v3.txt");
 fseek(fp, 0, SEEK_END);
 FILESIZE = ftell(fp);
 fseek(fp, 0, SEEK_SET);
 fclose(fp);
 fd = open(FILEPATH, O_RDONLY);
 map = (char *) mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
 char z;
 stringstream ss;
 for (long i = 0; i <= FILESIZE; ++i) 
    {
    z = map[i];
    if (z != 'n') 
        {
        ss << z;
            }
    else 
        {
            // c style tokenizing
            ss.str("");
            }
        }
 if (munmap(map, FILESIZE) == -1) perror("Error un-mmapping the file");
 close(fd);

数据文件通常在10GB左右……switch/case 语句会减慢速度吗？

几乎可以肯定不会，这看起来更像是受 I/O 限制（I/O bound）。不过你应该实际测量一下。现代CPU有性能计数器，用合适的工具很容易利用。但是，让我们先把问题划分为几个主要方面：设备的I/O、内存的加载/存储、CPU。您可以在代码中的关键位置读取时钟并做标记，以了解每个操作需要多长时间。在linux上，您可以使用clock_gettime()或rdtsc指令访问精度高于操作系统时钟节拍的时钟。

考虑mmap/CreateFileMapping，它们中的任何一个都可能为您正在访问的页面提供更好的效率/吞吐量。

如果要流式处理已经调入内存的大量数据，请考虑使用大页（large/huge pages）。

摘自 mmap() 手册：

说明

mmap() 在调用进程的虚拟地址空间中创建一个新的映射。新映射的起始地址由 addr 指定。length 参数指定映射的长度。

以下是mmap()示例：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#define FILEPATH "/tmp/mmapped.bin"
#define NUMINTS  (1000)
#define FILESIZE (NUMINTS * sizeof(int))
int main(int argc, char *argv[])
{
    int i;
    int fd;
    int *map;  /* mmapped array of int's */
    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1) {
    perror("Error opening file for reading");
    exit(EXIT_FAILURE);
    }
    map = mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
    close(fd);
    perror("Error mmapping the file");
    exit(EXIT_FAILURE);
    }
    /* Read the file int-by-int from the mmap
     */
    for (i = 1; i <=NUMINTS; ++i) {
    printf("%d: %dn", i, map[i]);
    }
    if (munmap(map, FILESIZE) == -1) {
    perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}