C++ 逐词读取文件,且不带任何符号

C++ read file word by word without any symbol

本文关键字:任何 符号 单词 读取 文件      更新时间:2023-10-16

我想从文本文件中读取单词。这是我的 C++ 代码:

int main(int argc, const char * argv[]) {
    // Input file is resolved relative to the current working directory.
    ifstream file("./wordCount.txt");

    // operator>> splits on whitespace only, so any punctuation stays
    // attached to the token it touches.
    for (string word; file >> word; ) {
        cout << word << endl;
    }
    return 0;
}

文本文件包含句子:

I don't have power, but he has power.

这是我得到的结果:

I
don\342\200\231t
have
power,
but
he
has
power.

您能告诉我如何像以下格式一样获得结果:

I
don't
have
power
but
he
has
power

谢谢。

我理解您是想去掉标点符号。

不幸的是,从流中提取字符串时只会把空白字符当作分隔符。因此,"don't" 或 "Hello,World" 会被读成一个单词,而 "don' t" 或 "Hello, World" 会被读成两个单词。

替代方案是逐行读取文本,然后使用string::find_first_of()从一个分隔符跳到下一个分隔符:

// Characters that terminate a word: blank, tab, CR, LF and common punctuation.
// The apostrophe is deliberately not listed, so "don't" stays a single word.
string separator{" \t\r\n,.!?;:"};
string line;
while(getline (cin, line)){  // read line by line
    size_t s = 0;            // s = offset at which to look for the next word
    // skip leading separators; stop when no word is left on this line
    while ((s = line.find_first_not_of(separator, s)) != string::npos) {
        const size_t e = line.find_first_of(separator, s);  // e = end of this word
        // substr clamps the count, so e == npos yields "up to end of line"
        cout << line.substr(s, e - s) << endl;
        if (e == string::npos)                // word ran to end of line
            break;
        s = e + 1;                            // position after the separator
    }
}

在线演示

下面的代码会去掉除撇号以外的所有标点符号:

#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
using namespace std;
int main(int argc, const char * argv[]) {
    ifstream file("wordCount.txt");
    string word;
    while(file >> word) {
        for (auto c : word)
            if (ispunct(c) && c != '`')
                word.erase(word.find_first_of(c));
        cout << word << endl;
    }
    return 0;
}

应产生所需的输出:

Georgioss-MacBook-Pro:~ gsamaras$ g++ -Wall -std=c++0x main.cpp 
Georgioss-MacBook-Pro:~ gsamaras$ ./a.out 
I
don`t
have
power
but
he
has
power

如果某些字符出现问题,我建议您检查文件的编码,可以尝试执行以下命令(如下所示):

file -I wordCount.txt 
wordCount.txt: text/plain; charset=us-ascii

这对我有用。或只是打开文本编辑器并确保字符有效。

为了简化调试,我将文件替换为 std::istringstream。

  • 易于添加其他测试输入
  • 输入是记录的,可重复。

我还添加了一个布尔类数据成员(m_dbg),用于启用/禁用额外的诊断信息。

#include <algorithm>
#include <chrono>
// 'compressed' chrono access --------------vvvvvvv
// Short aliases for the clock, its time point and the common duration units.
typedef std::chrono::high_resolution_clock  HRClk_t; // std-chrono-hi-res-clk
typedef HRClk_t::time_point                 Time_t;  // std-chrono-hi-res-clk-time-point
typedef std::chrono::milliseconds           MS_t;    // std-chrono-milliseconds
typedef std::chrono::microseconds           US_t;    // std-chrono-microseconds
typedef std::chrono::nanoseconds            NS_t;    // std-chrono-nanoseconds
using   namespace std::chrono_literals;          // support suffixes like 100ms, 2s, 30us
#include <array>
#include <cctype>
#include <clocale>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// Demo word extractor: reads a fixed test text line by line, drops ',' and
/// '.', splits what remains on whitespace and prints each word on its own line.
class T496_t
{
   std::array<char, 256>     m_keep;        // lookup table: 0 means "drop this character"
   std::vector<std::string>  m_wordVec;     // every word extracted so far, in input order
   bool                      m_dbg = false; // true: also print per-line / per-word diagnostics
public:
   /// Builds the filter table: every character is kept except NUL, ',' and '.'.
   T496_t()
      {
         for (unsigned i=0; i<256; ++i)
            m_keep[i] = static_cast<char>(i);   // non-zero for every i except 0
         m_keep[static_cast<unsigned char>(',')] = 0;
         m_keep[static_cast<unsigned char>('.')] = 0;
      }
   ~T496_t() = default;

   /// Runs the demo over the built-in test text.
   /// @return always 0
   int exec()
      {
         std::istringstream file(
            "Hello\n"
            "I don't have power, but he has power.\n"
            "I don't  have power , but he has power.\n"
            ); //ifstream file("./wordCount.txt");
         unsigned lineCount = 1;
         std::string line;
         // Looping on getline's result also handles a last line that has no
         // trailing newline, which an eof() check right after the read would drop.
         while(std::getline(file, line))
         {
            if(m_dbg) std::cout << "\n  line " << lineCount++ << " :  '"
                                << line << "'\n  " << std::setfill('-')
                                << std::setw(static_cast<int>(line.size())+12)
                                << "-" << std::setfill(' ');
            std::cout << '\n';

            if(line.empty())
               continue;     // ignore empty lines
            extractWordsFrom(line); // extract words
         }
         return(0);
      }
private: // methods
   /// True when c is a whitespace character. The cast is required because the
   /// <cctype> classifiers are undefined for negative char values.
   static bool isBlank(char c)
      {
         return 0 != std::isspace(static_cast<unsigned char>(c));
      }

   /// Filters `unfiltered` (which is left-trimmed in place), then appends each
   /// whitespace-delimited word to m_wordVec and prints it.
   void extractWordsFrom(std::string& unfiltered)
      {
         std::string line; // filtered
         filter(unfiltered, line);
         if(line.empty()) {
            if(m_dbg) std::cout << "  empty line" << std::endl;
            return;
         }
         const auto first = line.cbegin();
         const auto last  = line.cend();
         auto wordBegin   = first;
         while(true)
         {
            wordBegin = std::find_if_not(wordBegin, last, isBlank); // skip blanks
            if(wordBegin == last)
               break;        // only blanks were left: no empty word is emitted
            const auto wordEnd = std::find_if(wordBegin, last, isBlank);
            m_wordVec.emplace_back(wordBegin, wordEnd);
            if(m_dbg)
            {
               std::cout << "  word(" << std::setw(3) << (wordBegin - first) << ",";
               if(wordEnd == last) std::cout << " eoln): ";
               else                std::cout << std::setw(3) << (wordEnd - first) << "): ";
            }
            std::cout << "  " << m_wordVec.back() << '\n';
            wordBegin = wordEnd;
         }
      }

   /// Left-trims `unfiltered` in place and appends to `line` every character
   /// of it whose m_keep entry is non-zero.
   void filter(std::string& unfiltered, std::string& line)
      {
         ltrim(unfiltered); // remove leading blanks
         line.reserve(line.size() + unfiltered.size());
         for(const char ch : unfiltered)                 // transfer all chars
            if(m_keep[static_cast<unsigned char>(ch)])   // never index with a negative char
               line.push_back(ch);
      }

   // trim from start
   void  ltrim(std::string &s) {
      s.erase(s.begin(),
              std::find_if(s.begin(), s.end(),
                           [](char c) { return !isBlank(c); }));
   }
   // trim from end
   void rtrim(std::string &s) {
      s.erase(std::find_if(s.rbegin(), s.rend(),
                           [](char c) { return !isBlank(c); }).base(), s.end());
   }
   // trim from both ends
   void  lrtrim(std::string &s) { rtrim(s); ltrim(s); }
}; // class T496_t

int main(int /*argc*/, char** /*argv[]*/)
{
  setlocale(LC_ALL, "");
  std::ios::sync_with_stdio(false);
  Time_t start_us = HRClk_t::now();
  int retVal = -1;
  {
     T496_t   t496;
     retVal = t496.exec();
  }
  auto  duration_us = std::chrono::duration_cast<US_t>(HRClk_t::now() - start_us);
  std::cout << "nn  FINI   " << duration_us.count() << " us" << std::endl;
  return(retVal);

}


   // desired output:
   // I
   // don't
   // have
   // power
   // but
   // he
   // has
   // power

从此代码输出:

  Hello
  I
  don't
  have
  power
  but
  he
  has
  power
  I
  don't
  have
  power
  but
  he
  has
  power

用m_dbg = true

输出
  line 1 :  'Hello'
  -----------------
  word(  0, eoln):   Hello
  line 2 :  'I don't have power, but he has power.'
  -------------------------------------------------
  word(  0,  1):   I
  word(  2,  7):   don't
  word(  8, 12):   have
  word( 13, 18):   power
  word( 19, 22):   but
  word( 23, 25):   he
  word( 26, 29):   has
  word( 30, eoln):   power
  line 3 :  'I don't  have power , but he has power.'
  ---------------------------------------------------
  word(  0,  1):   I
  word(  2,  7):   don't
  word(  9, 13):   have
  word( 14, 19):   power
  word( 21, 24):   but
  word( 25, 27):   he
  word( 28, 31):   has
  word( 32, eoln):   power

  FINI   215 us

首先用一种简单的方法过滤字符串:把除撇号(即 ')以外的所有标点符号替换为空白字符,以便后续处理(即利用一些内置功能)。

#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <sstream>
#include <iterator>
using namespace std;
// Predicate for std::replace_if: returns true for every punctuation character
// that should be blanked out, i.e. all punctuation except the apostrophe.
bool isOk(char c)
{
    if ( c == '\'' )
        return false;   // keep apostrophes so that "don't" stays intact
    // The <cctype> classifiers are undefined for negative char values.
    return ispunct(static_cast<unsigned char>(c)) != 0;
}
// Reads whitespace-separated tokens from data.txt, blanks out the punctuation
// selected by isOk, and prints each resulting word on its own line.
int main()
{
    ifstream file("data.txt");
    string word;
    while(file >> word){
        // Turn unwanted punctuation into spaces so that stream extraction
        // below can split on it.
        std::replace_if(word.begin(), word.end(), isOk, ' ');
        istringstream ss(word);
        // Re-tokenise the cleaned token; one word per output line.
        copy(istream_iterator<string>(ss), istream_iterator<string>(), ostream_iterator<string>(cout, "\n"));
    }
    return 0;
}

输出为

I
don't
have
power
but
he
has
power