C++将UTF-8字符串作为UTF-16输出到std::cout

C++ Output UTF-8 strings as UTF-16 to std::cout

本文关键字：输出 std UTF-16 cout UTF-8 字符串 C++ 更新时间：2023-10-16

我有很多使用C++03、STL和Boost 1.54基于UTF-8编写的代码。
所有代码通过std::cout或std::cerr向控制台输出数据
我不想在我的代码库中引入新的库，也不想切换到C++11，但我想将代码移植到Windows
我不想为了改用std::wcout或std::wcerr（而不是std::cout或std::cerr）而重写所有代码，但我仍然希望所有内容都以UTF-16的形式显示在控制台上。
在将所有基于char的数据（UTF-8编码）写入控制台之前，是否有办法让std::cout和std::cerr先把它们转换为基于wchar_t的数据（UTF-16编码）？
如果能看到一个只使用C++03、STL和Boost 1.54的解决方案，那就太好了。
我发现Boost Locale提供了针对单个字符串的转换函数，Boost Spirit中有一个UTF-8到UTF-32的迭代器，但在不使用额外的库、也不切换到C++11的前提下，我找不到任何可以将UTF-8转换为UTF-16的codecvt facet。

提前谢谢。

附言：我知道这样做是可行的，但我希望在这里找到更好的解决方案。

我没有想出比已经暗示的更好的解决方案
因此，我将在这里为任何感兴趣的人分享基于streambuf的解决方案。希望有人能想出一个更好的解决方案并在这里分享。

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <string>

/* Character type of the argv entries handed to test_main(): wchar_t when
 * building for Windows with _UNICODE on an MSVC-compatible runtime, plain
 * char in every other configuration.
 */
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#define TEST_ARG_TYPE wchar_t
#else /* not windows, unicode */
#define TEST_ARG_TYPE char
#endif /* windows, unicode */

/* Provide _O_U16TEXT if nothing has defined it yet; main() passes this
 * value to _setmode() to switch stdout to UTF-16 text mode.
 */
#ifndef _O_U16TEXT
#define _O_U16TEXT 0x20000
#endif

/**
 * Counts how many leading bytes of a UTF-8 buffer can be consumed without
 * cutting a multi-byte character in half.
 *
 * Scanning stops at the end of the buffer, at the first NUL byte, or at a
 * lead byte whose announced sequence length would run past the end of the
 * buffer. Only the lead byte is inspected to determine the sequence length;
 * the trailing bytes of a sequence are not validated. A run of stray
 * continuation bytes (0x80..0xBF that do not follow a lead byte) is skipped
 * and counted as consumed.
 *
 * @param buf  bytes to inspect (not dereferenced when size is 0)
 * @param size number of bytes available in buf
 * @return number of bytes, counted from the start of buf, that consist of
 *         whole characters only
 */
static size_t countValidUtf8Bytes(const unsigned char * buf, const size_t size) {
    size_t i, charSize;
    const unsigned char * src = buf;
    for (i = 0; i < size && (*src) != 0; i += charSize, src += charSize) {
        charSize = 0;
        if ((*src) >= 0xFC) {
            charSize = 6;
        } else if ((*src) >= 0xF8) {
            charSize = 5;
        } else if ((*src) >= 0xF0) {
            charSize = 4;
        } else if ((*src) >= 0xE0) {
            charSize = 3;
        } else if ((*src) >= 0xC0) {
            charSize = 2;
        } else if ((*src) >= 0x80) {
            /* Stray continuation bytes (should never happen): skip the run
             * one byte at a time and stop at the first byte that is not a
             * continuation byte, so that a following lead byte or ASCII
             * character is processed normally by the next iteration.
             * The current byte is itself a continuation byte, hence the
             * loop always advances by at least one.
             */
            while ((i + charSize) < size && (src[charSize] & 0xC0) == 0x80) {
                charSize++;
            }
        } else {
            /* ASCII character. */
            charSize = 1;
        }
        /* Incomplete character at the end of the buffer: do not count it. */
        if ((i + charSize) > size) break;
    }
    return i;
}

#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#include <locale>
#include <streambuf>
#include <boost/locale.hpp>
extern "C" {
#include <fcntl.h>
#include <io.h>
#include <windows.h>
int _CRT_glob;
extern void __wgetmainargs(int *, wchar_t ***, wchar_t ***, int, int *);
}

/**
 * Output stream buffer that collects narrow (char) data in an internal
 * buffer of BUFFER_SIZE bytes and hands it to overflow()/sync(), which
 * convert it to wide characters and write it to the FILE given at
 * construction.
 *
 * The object owns its internal buffer; it does not own the FILE.
 */
class Utf8ToUtf16Buffer : public std::basic_streambuf< char, std::char_traits<char> > {
private:
    char * outBuf; /* owned storage of BUFFER_SIZE bytes */
    FILE * outFd;  /* destination stream, not owned */
    /* Not copyable: a copy would share outBuf and delete[] it twice.
     * Declared but intentionally left undefined (C++03 idiom).
     */
    Utf8ToUtf16Buffer(const Utf8ToUtf16Buffer &);
    Utf8ToUtf16Buffer & operator=(const Utf8ToUtf16Buffer &);
public:
    static const size_t BUFFER_SIZE = 1024;
    typedef std::char_traits<char> traits_type;
    typedef traits_type::int_type int_type;
    typedef traits_type::pos_type pos_type;
    typedef traits_type::off_type off_type;
    /**
     * Allocates the internal buffer and remembers the destination stream.
     *
     * @param o stream that receives the converted output
     */
    explicit Utf8ToUtf16Buffer(FILE * o) : outBuf(new char[BUFFER_SIZE]), outFd(o) {
        /* Initialize the put pointer. Overflow won't get called until this
         * buffer is filled up, so we need to use valid pointers. The put
         * area is one byte shorter than the allocation so that overflow()
         * always has room to store the character it is called with.
         */
        this->setp(outBuf, outBuf + BUFFER_SIZE - 1);
    }
    /** Releases the internal buffer; pending data is not written here. */
    ~Utf8ToUtf16Buffer() {
        delete[] outBuf;
    }
protected:
    /** Writes out the buffered data plus c (unless c is eof). */
    virtual int_type overflow(int_type c);
    /** Writes out the buffered data; returns 0 on success, -1 on failure. */
    virtual int_type sync();
};

/**
 * Converts the buffered UTF-8 data to wide characters, writes it to the
 * output stream and makes the put area available again.
 *
 * Only whole UTF-8 characters are converted; the bytes of a character that
 * is still incomplete are kept at the start of the buffer so that the next
 * call can complete it. Because countValidUtf8Bytes() stops at a NUL byte,
 * data following an embedded NUL is not converted by this function.
 *
 * @param c character that did not fit into the put area, or eof when the
 *          function is only asked to write out the buffered data
 * @return a value different from eof on success, eof if writing failed
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::overflow(Utf8ToUtf16Buffer::int_type c) {
    char * const iBegin = this->outBuf;
    char * iEnd = this->pptr();
    int_type result = traits_type::not_eof(c);
    /* Store the pending character. The put area passed to setp() is one
     * byte shorter than the allocation, so this slot always exists.
     */
    if ( ! traits_type::eq_int_type(c, traits_type::eof()) ) {
        *iEnd = traits_type::to_char_type(c);
        iEnd++;
    }
    /* Calculate output data length. */
    const size_t iLen = static_cast<size_t>(iEnd - iBegin);
    const size_t iLenU8 = countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(iBegin), iLen);
    /* Convert string to UTF-16 and write to defined file descriptor. */
    if (iLenU8 > 0) {
        const std::wstring wide = boost::locale::conv::utf_to_utf<wchar_t>(std::string(iBegin, iBegin + iLenU8));
        /* The text is passed as an argument and never as the format string,
         * otherwise a '%' in the output would be interpreted by fwprintf.
         */
        if (fwprintf(this->outFd, L"%ls", wide.c_str()) < 0) {
            /* Failed to write data to output file descriptor. */
            result = traits_type::eof();
        }
    }
    /* Bytes that could not be converted yet (incomplete UTF-8 character). */
    size_t overhead = iLen - iLenU8;
    if (overhead >= BUFFER_SIZE - 1) {
        /* The remainder would leave no room in the put area, so no further
         * progress is possible. Drop it to keep all writes inside outBuf.
         */
        overhead = 0;
    }
    if (overhead > 0) {
        /* Move incomplete UTF-8 characters to the start of the buffer. */
        memmove(iBegin, iBegin + iLenU8, overhead);
    }
    /* Reset the put pointers with the same bounds as the constructor (one
     * byte reserved at the end) and continue behind the kept bytes.
     */
    this->setp(iBegin, iBegin + BUFFER_SIZE - 1);
    this->pbump(static_cast<int>(overhead));
    return result;
}

/**
 * Writes out all buffered whole characters and flushes the output stream so
 * that a flush of the owning std::ostream makes the text visible.
 *
 * @return 0 on success, -1 if converting/writing or flushing failed
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::sync() {
    const bool written = ! traits_type::eq_int_type(this->overflow(traits_type::eof()), traits_type::eof());
    /* overflow() only hands the data to the FILE; push it through as well. */
    const bool flushed = (fflush(this->outFd) == 0);
    return (written && flushed) ? 0 : -1;
}
#endif /* windows, unicode */

/* User defined entry point; receives wide arguments on Windows/Unicode
 * builds and narrow arguments otherwise (see TEST_ARG_TYPE).
 */
int test_main(int argc, TEST_ARG_TYPE ** argv);

#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
/* Windows/Unicode entry point: fetches the wide command line, switches
 * stdout to UTF-16 text mode and routes std::cout through a
 * Utf8ToUtf16Buffer while test_main() runs.
 */
int main(/*int argc, char ** argv*/) {
    int wideArgCount;
    int startInfo = 0;
    wchar_t ** wideArgs;
    wchar_t ** wideEnv;
    /* this also creates the global variable __wargv */
    __wgetmainargs(&wideArgCount, &wideArgs, &wideEnv, _CRT_glob, &startInfo);
    /* enable UTF-16 output to standard output console */
    _setmode(_fileno(stdout), _O_U16TEXT);
    std::locale::global(boost::locale::generator().generate("UTF-8"));
    /* install the converting buffer and remember the one it replaces */
    Utf8ToUtf16Buffer u8cout(stdout);
    std::streambuf * const previous = std::cout.rdbuf(&u8cout);
    /* process user defined main function */
    const int result = test_main(wideArgCount, wideArgs);
    /* revert stream buffers to let cout clean up remaining memory correctly */
    std::cout.rdbuf(previous);
    return result;
}
#else /* not windows or unicode */
/* Plain entry point: forwards the arguments unchanged. */
int main(int argc, char ** argv) {
    return test_main(argc, argv);
}
#endif /* windows, unicode */
/**
 * Demonstration driver: for every byte prefix of a UTF-8 sample string it
 * prints the prefix length, the number of bytes countValidUtf8Bytes()
 * accepts, the accepted text (through std::cout) and a hex dump of the
 * prefix.
 *
 * @return EXIT_SUCCESS
 */
int test_main(int /*argc*/, TEST_ARG_TYPE ** /*argv*/) {
    /* UTF-8 bytes of "abc", three 2-byte characters and nine 3-byte
     * characters, written as hex escapes so the source encoding does not
     * matter.
     */
    const std::string str("\x61\x62\x63\xC3\xA4\xC3\xB6\xC3\xBC\xE3\x81\x82\xE3\x81\x88\xE3\x81\x84\xE3\x82\xA2\xE3\x82\xA8\xE3\x82\xA4\xE4\xBA\x9C\xE6\xB1\x9F\xE6\x84\x8F");
    for (size_t i = 1; i <= str.size(); i++) {
        const std::string part(str.begin(), str.begin() + i);
        const size_t validByteCount = countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(part.c_str()), part.size());
        /* %u expects unsigned int; size_t may be wider, so convert explicitly. */
        wprintf(L"i = %u, v = %u\n", static_cast<unsigned int>(i), static_cast<unsigned int>(validByteCount));
        const std::string valid(str.begin(), str.begin() + validByteCount);
        std::cout << valid << std::endl;
        std::cout.flush();
        /* Hex dump of the prefix; the mask avoids sign extension of char. */
        for (size_t j = 0; j < part.size(); j++) {
            wprintf(L"%02X", static_cast<int>(part[j]) & 0xFF);
        }
        wprintf(L"\n");
    }
    return EXIT_SUCCESS;
}

我觉得这可能是个坏主意。。但我认为，只要控制台的字体正确，它仍然可以被视为有效。。

#include <iostream>
#include <windows.h>
//#include <io.h>
//#include <fcntl.h>
// Converts a NUL-terminated UTF-8 string to a UTF-16 std::wstring using
// MultiByteToWideChar.
//
// utf8: NUL-terminated UTF-8 text; a null pointer yields an empty string.
// Returns the converted text without a trailing NUL character, or an empty
// string if the input is empty or the conversion fails.
std::wstring UTF8ToUTF16(const char* utf8)
{
    std::wstring utf16;
    if (utf8 == nullptr)
    {
        return utf16;
    }
    // With a source length of -1 the reported size includes the terminating NUL.
    const int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (len > 1)
    {
        utf16.resize(len);
        const int written = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &utf16[0], len);
        // Drop the terminator that was converted along with the text so that
        // size() reflects the number of real characters; on failure (0)
        // return an empty string instead of uninitialized content.
        utf16.resize(written > 0 ? written - 1 : 0);
    }
    return utf16;
}
// Writes a NUL-terminated UTF-8 string to the console as UTF-16 via
// WriteConsoleW instead of going through the stream's own buffer.
//
// os:   stream the operator was invoked on; returned for chaining.
// data: NUL-terminated UTF-8 text; a null pointer sets badbit on os and
//       writes nothing.
std::ostream& operator << (std::ostream& os, const char* data)
{
    if (data == nullptr)
    {
        os.setstate(std::ios_base::badbit);
        return os;
    }
    // Anything already buffered in the stream must reach the console first,
    // otherwise the direct console write below would overtake it.
    os.flush();
    //_setmode(_fileno(stdout), _O_U16TEXT);
    // NOTE(review): SetConsoleCP changes the console *input* code page and
    // is presumably not needed for WriteConsoleW - confirm and remove.
    SetConsoleCP(1200);
    const std::wstring str = UTF8ToUTF16(data);
    DWORD slen = static_cast<DWORD>(str.size());
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);
    return os;
}
std::ostream& operator << (std::ostream& os, const std::string& data)
{
    //_setmode(_fileno(stdout), _O_U16TEXT);
    SetConsoleCP(1200);
    std::wstring str = UTF8ToUTF16(&data[0]);
    DWORD slen = str.size();
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);
    return os;
}
std::wostream& operator <<(std::wostream& os, const wchar_t* data)
{
    DWORD slen = wcslen(data);
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data, slen, &slen, nullptr);
    return os;
}
std::wostream& operator <<(std::wostream& os, const std::wstring& data)
{
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data.c_str(), data.size(), nullptr, nullptr);
    return os;
}
// Demo: prints a non-ASCII string literal through the operator<< overload
// for const char* defined above.
int main()
{
    const char* const greeting = "Россия";
    std::cout << greeting;
    return 0;
}

现在std::cout和std::wcout都使用WriteConsoleW函数。你必须为const char*、char*、std::string、char等类型提供重载——你需要什么类型就重载什么类型，也可以考虑使用模板。