suyu/src/common/string_util.cpp

// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.

#include <algorithm>

#include "common/common.h"
#include "common/string_util.h"

#ifdef _WIN32
    #include <Windows.h>
    #include <codecvt>
#else
    #include <iconv.h>
#endif

namespace Common {

/// Make a string lowercase
std::string ToLower(std::string str) {
    std::transform(str.begin(), str.end(), str.begin(), ::tolower);
    return str;
}

/// Make a string uppercase
std::string ToUpper(std::string str) {
    std::transform(str.begin(), str.end(), str.begin(), ::toupper);
    return str;
}

// faster than sscanf
bool AsciiToHex(const char* _szValue, u32& result)
{
    char *endptr = NULL;
    const u32 value = strtoul(_szValue, &endptr, 16);

    if (!endptr || *endptr)
        return false;

    result = value;
    return true;
}

bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list args)
{
    int writtenCount;

#ifdef _WIN32
    // You would think *printf are simple, right? Iterate on each character,
    // if it's a format specifier handle it properly, etc.
    //
    // Nooooo. Not according to the C standard.
    //
    // According to the C99 standard (7.19.6.1 "The fprintf function")
    //     The format shall be a multibyte character sequence
    //
    // Because some character encodings might have '%' signs in the middle of
    // a multibyte sequence (SJIS for example only specifies that the first
    // byte of a 2 byte sequence is "high", the second byte can be anything),
    // printf functions have to decode the multibyte sequences and try their
    // best to not screw up.
    //
    // Unfortunately, on Windows, the locale for most languages is not UTF-8
    // as we would need. Notably, for zh_TW, Windows chooses EUC-CN as the
    // locale, and completely fails when trying to decode UTF-8 as EUC-CN.
    //
    // On the other hand, the fix is simple: because we use UTF-8, no such
    // multibyte handling is required as we can simply assume that no '%' char
    // will be present in the middle of a multibyte sequence.
    //
    // This is why we lookup an ANSI (cp1252) locale here and use _vsnprintf_l.
    static locale_t c_locale = NULL;
    if (!c_locale)
        c_locale = _create_locale(LC_ALL, ".1252");
    writtenCount = _vsnprintf_l(out, outsize, format, c_locale, args);
#else
    writtenCount = vsnprintf(out, outsize, format, args);
#endif

    if (writtenCount > 0 && writtenCount < outsize)
    {
        out[writtenCount] = '\0';
        return true;
    }
    else
    {
        out[outsize - 1] = '\0';
        return false;
    }
}

std::string StringFromFormat(const char* format, ...)
{
    va_list args;
    char *buf = NULL;
#ifdef _WIN32
    int required = 0;

    va_start(args, format);
    required = _vscprintf(format, args);
    buf = new char[required + 1];
    CharArrayFromFormatV(buf, required + 1, format, args);
    va_end(args);

    std::string temp = buf;
    delete[] buf;
#else
    va_start(args, format);
    if (vasprintf(&buf, format, args) < 0)
        ERROR_LOG(COMMON, "Unable to allocate memory for string");
    va_end(args);

    std::string temp = buf;
    free(buf);
#endif
    return temp;
}

// For Debugging. Read out an u8 array.
std::string ArrayToString(const u8 *data, u32 size, int line_len, bool spaces)
{
    std::ostringstream oss;
    oss << std::setfill('0') << std::hex;
    
    for (int line = 0; size; ++data, --size)
    {
        oss << std::setw(2) << (int)*data;
        
        if (line_len == ++line)
        {
            oss << '\n';
            line = 0;
        }
        else if (spaces)
            oss << ' ';
    }

    return oss.str();
}

// Turns "  hej " into "hej". Also handles tabs.
std::string StripSpaces(const std::string &str)
{
    const size_t s = str.find_first_not_of(" \t\r\n");

    if (str.npos != s)
        return str.substr(s, str.find_last_not_of(" \t\r\n") - s + 1);
    else
        return "";
}

// "\"hello\"" is turned to "hello"
// This one assumes that the string has already been space stripped in both
// ends, as done by StripSpaces above, for example.
std::string StripQuotes(const std::string& s)
{
    if (s.size() && '\"' == s[0] && '\"' == *s.rbegin())
        return s.substr(1, s.size() - 2);
    else
        return s;
}

bool TryParse(const std::string &str, u32 *const output)
{
    char *endptr = NULL;

    // Reset errno to a value other than ERANGE
    errno = 0;

    unsigned long value = strtoul(str.c_str(), &endptr, 0);
    
    if (!endptr || *endptr)
        return false;

    if (errno == ERANGE)
        return false;

#if ULONG_MAX > UINT_MAX
    if (value >= 0x100000000ull
        && value <= 0xFFFFFFFF00000000ull)
        return false;
#endif

    *output = static_cast<u32>(value);
    return true;
}

bool TryParse(const std::string &str, bool *const output)
{
    if ("1" == str || "true" == ToLower(str))
        *output = true;
    else if ("0" == str || "false" == ToLower(str))
        *output = false;
    else
        return false;

    return true;
}

std::string StringFromBool(bool value)
{
    return value ? "True" : "False";
}

bool SplitPath(const std::string& full_path, std::string* _pPath, std::string* _pFilename, std::string* _pExtension)
{
    if (full_path.empty())
        return false;

    size_t dir_end = full_path.find_last_of("/"
    // windows needs the : included for something like just "C:" to be considered a directory
#ifdef _WIN32
        ":"
#endif
    );
    if (std::string::npos == dir_end)
        dir_end = 0;
    else
        dir_end += 1;

    size_t fname_end = full_path.rfind('.');
    if (fname_end < dir_end || std::string::npos == fname_end)
        fname_end = full_path.size();

    if (_pPath)
        *_pPath = full_path.substr(0, dir_end);

    if (_pFilename)
        *_pFilename = full_path.substr(dir_end, fname_end - dir_end);

    if (_pExtension)
        *_pExtension = full_path.substr(fname_end);

    return true;
}

void BuildCompleteFilename(std::string& _CompleteFilename, const std::string& _Path, const std::string& _Filename)
{
    _CompleteFilename = _Path;

    // check for seperator
    if (DIR_SEP_CHR != *_CompleteFilename.rbegin())
        _CompleteFilename += DIR_SEP_CHR;

    // add the filename
    _CompleteFilename += _Filename;
}

void SplitString(const std::string& str, const char delim, std::vector<std::string>& output)
{
    std::istringstream iss(str);
    output.resize(1);

    while (std::getline(iss, *output.rbegin(), delim))
        output.push_back("");

    output.pop_back();
}

std::string TabsToSpaces(int tab_size, const std::string &in)
{
    const std::string spaces(tab_size, ' ');
    std::string out(in);

    size_t i = 0;
    while (out.npos != (i = out.find('\t')))
        out.replace(i, 1, spaces);

    return out;
}

std::string ReplaceAll(std::string result, const std::string& src, const std::string& dest)
{
    size_t pos = 0;

    if (src == dest)
        return result;

    while ((pos = result.find(src, pos)) != std::string::npos)
    {
        result.replace(pos, src.size(), dest);
        pos += dest.length();
    }

    return result;
}

// UriDecode and UriEncode are from http://www.codeguru.com/cpp/cpp/string/conversions/print.php/c12759
// by jinq0123 (November 2, 2006)

// Uri encode and decode.
// RFC1630, RFC1738, RFC2396

//#include <string>
//#include <assert.h>

const char HEX2DEC[256] = 
{
    /*       0  1  2  3   4  5  6  7   8  9  A  B   C  D  E  F */
    /* 0 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* 1 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* 2 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* 3 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,16,16, 16,16,16,16,

    /* 4 */ 16,10,11,12, 13,14,15,16, 16,16,16,16, 16,16,16,16,
    /* 5 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* 6 */ 16,10,11,12, 13,14,15,16, 16,16,16,16, 16,16,16,16,
    /* 7 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,

    /* 8 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* 9 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* A */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* B */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,

    /* C */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* D */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* E */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16,
    /* F */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16
};

std::string UriDecode(const std::string & sSrc)
{
    // Note from RFC1630:  "Sequences which start with a percent sign
    // but are not followed by two hexadecimal characters (0-9, A-F) are reserved
    // for future extension"

    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const size_t SRC_LEN = sSrc.length();
    const unsigned char * const SRC_END = pSrc + SRC_LEN;
    const unsigned char * const SRC_LAST_DEC = SRC_END - 2;   // last decodable '%' 

    char * const pStart = new char[SRC_LEN];
    char * pEnd = pStart;

    while (pSrc < SRC_LAST_DEC)
    {
        if (*pSrc == '%')
        {
            char dec1, dec2;
            if (16 != (dec1 = HEX2DEC[*(pSrc + 1)])
                && 16 != (dec2 = HEX2DEC[*(pSrc + 2)]))
            {
                *pEnd++ = (dec1 << 4) + dec2;
                pSrc += 3;
                continue;
            }
        }

        *pEnd++ = *pSrc++;
    }

    // the last 2- chars
    while (pSrc < SRC_END)
        *pEnd++ = *pSrc++;

    std::string sResult(pStart, pEnd);
    delete [] pStart;
    return sResult;
}

// Only alphanum is safe.
const char SAFE[256] =
{
    /*      0 1 2 3  4 5 6 7  8 9 A B  C D E F */
    /* 0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 1 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 2 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 3 */ 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,

    /* 4 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 5 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
    /* 6 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 7 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,

    /* 8 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 9 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* A */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* B */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,

    /* C */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* D */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* E */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
};

std::string UriEncode(const std::string & sSrc)
{
    const char DEC2HEX[16 + 1] = "0123456789ABCDEF";
    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const size_t SRC_LEN = sSrc.length();
    unsigned char * const pStart = new unsigned char[SRC_LEN * 3];
    unsigned char * pEnd = pStart;
    const unsigned char * const SRC_END = pSrc + SRC_LEN;

    for (; pSrc < SRC_END; ++pSrc)
    {
        if (SAFE[*pSrc]) 
            *pEnd++ = *pSrc;
        else
        {
            // escape this char
            *pEnd++ = '%';
            *pEnd++ = DEC2HEX[*pSrc >> 4];
            *pEnd++ = DEC2HEX[*pSrc & 0x0F];
        }
    }

    std::string sResult((char *)pStart, (char *)pEnd);
    delete [] pStart;
    return sResult;
}

#ifdef _WIN32

std::string UTF16ToUTF8(const std::u16string& input)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.to_bytes(input);
}

std::u16string UTF8ToUTF16(const std::string& input)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.from_bytes(input);
}

static std::string UTF16ToUTF8(const std::wstring& input)
{
    auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr);

    std::string output;
    output.resize(size);

    if (size == 0 || size != WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), &output[0], output.size(), nullptr, nullptr))
        output.clear();

    return output;
}

static std::wstring CPToUTF16(u32 code_page, const std::string& input)
{
    auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0);

    std::wstring output;
    output.resize(size);

    if (size == 0 || size != MultiByteToWideChar(code_page, 0, input.data(), input.size(), &output[0], output.size()))
        output.clear();

    return output;
}

std::wstring UTF8ToUTF16W(const std::string &input)
{
    return CPToUTF16(CP_UTF8, input);
}

std::string SHIFTJISToUTF8(const std::string& input)
{
    return UTF16ToUTF8(CPToUTF16(932, input));
}

std::string CP1252ToUTF8(const std::string& input)
{
    return UTF16ToUTF8(CPToUTF16(1252, input));
}

#else

template <typename T>
static std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input)
{
    std::string result;

    iconv_t const conv_desc = iconv_open("UTF-8", fromcode);
    if ((iconv_t)(-1) == conv_desc)
    {
        ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno));
        iconv_close(conv_desc);
        return {};
    }

    const size_t in_bytes = sizeof(T) * input.size();
    // Multiply by 4, which is the max number of bytes to encode a codepoint
    const size_t out_buffer_size = 4 * in_bytes;

    std::string out_buffer;
    out_buffer.resize(out_buffer_size);

    auto src_buffer = &input[0];
    size_t src_bytes = in_bytes;
    auto dst_buffer = &out_buffer[0];
    size_t dst_bytes = out_buffer.size();

    while (0 != src_bytes)
    {
        size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes,
            &dst_buffer, &dst_bytes);

        if (static_cast<size_t>(-1) == iconv_result)
        {
            if (EILSEQ == errno || EINVAL == errno)
            {
                // Try to skip the bad character
                if (0 != src_bytes)
                {
                    --src_bytes;
                    ++src_buffer;
                }
            }
            else
            {
                ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno));
                break;
            }
        }
    }

    out_buffer.resize(out_buffer_size - dst_bytes);
    out_buffer.swap(result);
    
    iconv_close(conv_desc);
    
    return result;
}

std::u16string UTF8ToUTF16(const std::string& input)
{
    std::u16string result;

    iconv_t const conv_desc = iconv_open("UTF-16", "UTF-8");
    if ((iconv_t)(-1) == conv_desc)
    {
        ERROR_LOG(COMMON, "Iconv initialization failure [UTF-8]: %s", strerror(errno));
        iconv_close(conv_desc);
        return {};
    }

    const size_t in_bytes = sizeof(char) * input.size();
    // Multiply by 4, which is the max number of bytes to encode a codepoint
    const size_t out_buffer_size = 4 * sizeof(char16_t) * in_bytes;

    std::u16string out_buffer;
    out_buffer.resize(out_buffer_size);

    char* src_buffer = const_cast<char*>(&input[0]);
    size_t src_bytes = in_bytes;
    char* dst_buffer = (char*)(&out_buffer[0]);
    size_t dst_bytes = out_buffer.size();

    while (0 != src_bytes)
    {
        size_t const iconv_result = iconv(conv_desc, &src_buffer, &src_bytes,
                                          &dst_buffer, &dst_bytes);

        if (static_cast<size_t>(-1) == iconv_result)
        {
            if (EILSEQ == errno || EINVAL == errno)
            {
                // Try to skip the bad character
                if (0 != src_bytes)
                {
                    --src_bytes;
                    ++src_buffer;
                }
            }
            else
            {
                ERROR_LOG(COMMON, "iconv failure [UTF-8]: %s", strerror(errno));
                break;
            }
        }
    }

    out_buffer.resize(out_buffer_size - dst_bytes);
    out_buffer.swap(result);

    iconv_close(conv_desc);
    
    return result;
}

std::string UTF16ToUTF8(const std::u16string& input)
{
    return CodeToUTF8("UTF-16", input);
}

std::string CP1252ToUTF8(const std::string& input)
{
    //return CodeToUTF8("CP1252//TRANSLIT", input);
    //return CodeToUTF8("CP1252//IGNORE", input);
    return CodeToUTF8("CP1252", input);
}

std::string SHIFTJISToUTF8(const std::string& input)
{
    //return CodeToUTF8("CP932", input);
    return CodeToUTF8("SJIS", input);
}

#endif

}