#include "string_utils.h"
#include <functional>
// Splits `str` at every character that appears in `seps` and returns views
// into `str` (no copies are made, so the result must not outlive the data
// `str` refers to).
//
// - An empty piece that is followed by a separator (leading separator or two
//   adjacent separators) is kept unless `remove_empty_entries` is true.
// - The final piece (everything after the last separator) is only emitted
//   when it is non-empty, regardless of `remove_empty_entries`. Consequently
//   an empty input yields an empty vector.
//
// TODO (carried over from the original code): "bug fix" - the original note
// does not say which bug was meant.
std::vector<std::string_view> SplitString(const std::string_view& str, const std::string_view& seps, bool remove_empty_entries) {
  std::vector<std::string_view> pieces;
  std::string_view::size_type begin = 0;

  // Walk from separator to separator; `begin` always points at the first
  // character of the piece currently being delimited.
  for (auto sep_at = str.find_first_of(seps, begin);
       sep_at != std::string_view::npos;
       sep_at = str.find_first_of(seps, begin)) {
    const bool piece_is_empty = (sep_at == begin);
    if (!(piece_is_empty && remove_empty_entries)) {
      pieces.push_back(str.substr(begin, sep_at - begin));
    }
    begin = sep_at + 1;
  }

  // Whatever follows the last separator. It is empty when the input ended
  // with a separator (or was empty), in which case nothing is appended.
  const std::string_view tail = str.substr(begin);
  if (!tail.empty()) {
    pieces.push_back(tail);
  }
  return pieces;
}
// Returns true when `c` falls inside one of the code point ranges listed in
// the table below (inclusive bounds).
bool IsCJK(char32_t c) {
  struct Block {
    char32_t first;
    char32_t last;
  };
  static constexpr Block kBlocks[] = {
      {0x4E00, 0x9FFF},
      {0x3400, 0x4DBF},
      {0x20000, 0x2A6DF},
      {0x2A700, 0x2B73F},
      {0x2B740, 0x2B81F},
      {0x2B820, 0x2CEAF},
      {0xF900, 0xFAFF},
      {0x2F800, 0x2FA1F},
  };

  for (const Block& block : kBlocks) {
    if (c >= block.first && c <= block.last) {
      return true;
    }
  }
  return false;
}
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points in the table below:
// the ranges 9..10 and 8192..8202, plus a handful of single values.
bool IsSpace(char32_t c) {
  // Contiguous ranges first.
  const bool in_low_range = (c >= 9 && c <= 10);
  const bool in_high_range = (c >= 8192 && c <= 8202);
  if (in_low_range || in_high_range) {
    return true;
  }

  // Isolated code points.
  switch (c) {
    case 13:
    case 32:
    case 160:
    case 8239:
    case 8287:
    case 12288:
      return true;
    default:
      return false;
  }
}
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is covered by the table below. Every entry is an
// inclusive [first, last] range; isolated code points are stored as ranges
// of length one.
bool IsPunct(char32_t c) {
  struct Span {
    char32_t first;
    char32_t last;
  };
  static constexpr Span kPunctSpans[] = {
      {33, 47},       {58, 64},       {91, 96},       {123, 126},
      {161, 161},     {167, 167},     {171, 171},     {182, 183},
      {187, 187},     {191, 191},     {894, 894},     {903, 903},
      {8208, 8231},   {8240, 8259},   {8261, 8273},   {8275, 8286},
      {12289, 12291}, {12296, 12305}, {12308, 12319}, {12336, 12336},
      {12349, 12349},
  };

  for (const Span& span : kPunctSpans) {
    if (c >= span.first && c <= span.last) {
      return true;
    }
  }
  return false;
}
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is covered by the table below. Every entry is an
// inclusive [first, last] range; isolated code points are stored as ranges
// of length one. Note that 9, 10 and 13 are deliberately absent from the
// table (they are reported by IsSpace in this file).
bool IsControl(char32_t c) {
  struct Span {
    char32_t first;
    char32_t last;
  };
  static constexpr Span kControlSpans[] = {
      {0, 8},           {11, 12},         {14, 31},         {128, 159},
      {173, 173},       {888, 889},       {896, 899},       {907, 907},
      {909, 909},       {930, 930},       {8203, 8207},     {8234, 8238},
      {8288, 8302},     {11930, 11930},   {12020, 12030},   {40957, 40958},
      {64110, 64111},   {64218, 64254},   {173790, 173790}, {177973, 177982},
      {178206, 178207}, {183970, 183982}, {195102, 195102},
  };

  for (const Span& span : kControlSpans) {
    if (c >= span.first && c <= span.last) {
      return true;
    }
  }
  return false;
}
// Returns true when `c` lies in the inclusive range U+0300..U+036F.
// Only this one range is recognised.
// [TODO] support more accent
bool IsAccent(char32_t c)
{
  constexpr char32_t kFirstAccent = 0x300;
  constexpr char32_t kLastAccent = 0x36F;

  if (c < kFirstAccent) {
    return false;
  }
  return c <= kLastAccent;
}
// Lower-cases a single code point. Only the ranges handled below are
// converted; every other code point (including letters that are already
// lowercase) is returned unchanged.
//
// Code points are spelled numerically so the function does not depend on the
// encoding the compiler assumes for this source file.
//
// Fix over the previous version: the "Greek and Coptic" check was written
// with the Greek Extended letters U+1F08..U+1F68 and added 32 to them, which
// produced unrelated characters and left most of the basic Greek capitals
// untouched (only seven of them were special-cased). Both ranges now use the
// correct offsets.
char32_t ToLower(char32_t c) {
  // Basic ASCII: 'A'..'Z' -> 'a'..'z'.
  if (c >= U'A' && c <= U'Z') {
    return c + 32;
  }

  // Latin-1 Supplement capitals U+00C0 (A with grave) .. U+00DE (Thorn).
  if (c >= 0x00C0 && c <= 0x00DE) {
    // U+00D7 is the multiplication sign, not a letter.
    if (c == 0x00D7) {
      return c;
    }
    return c + 32;
  }

  // Latin Extended-A is intentionally not handled (it was disabled in the
  // original code as well).

  // Greek and Coptic capitals U+0391 (Alpha) .. U+03A9 (Omega); the lowercase
  // forms sit 32 code points higher.
  if (c >= 0x0391 && c <= 0x03A9) {
    // U+03A2 is an unassigned gap between Rho and Sigma.
    if (c == 0x03A2) {
      return c;
    }
    return c + 32;
  }

  // Greek Extended U+1F08..U+1F6F: in every 16-code-point row the capitals
  // occupy offsets 8..F and their lowercase forms sit 8 code points lower.
  if (c >= 0x1F08 && c <= 0x1F6F && (c & 0x8) != 0) {
    switch (c) {
      // Unassigned positions inside the capital columns.
      case 0x1F1E:
      case 0x1F1F:
      case 0x1F4E:
      case 0x1F4F:
      case 0x1F58:
      case 0x1F5A:
      case 0x1F5C:
      case 0x1F5E:
        return c;
      default:
        return c - 8;
    }
  }

  // Cyrillic capitals U+0410 (A) .. U+042F (Ya).
  if (c >= 0x0410 && c <= 0x042F) {
    return c + 32;
  }

  // If no conversion rule applies, return the original character.
  return c;
}
// Maps the Latin-1 Supplement letters U+00C0..U+00FF to their unaccented
// base letter; every other code point is returned unchanged.
//
// Characters that have no plain ASCII base letter in the table (AE, Eth,
// the multiplication sign, O with stroke, Thorn, sharp s, ae, eth, the
// division sign, o with stroke, thorn) map to themselves.
//
// The table is written with \u escapes instead of raw non-ASCII characters
// so that it cannot be altered by the source file's encoding, and its length
// is checked at compile time: a table that is one entry short would shift
// every following mapping and make U+00FF read the string terminator.
char32_t StripAccent(char32_t c)
{
  constexpr char32_t kFirst = 0x00C0;
  constexpr char32_t kLast = 0x00FF;

  // Index i holds the replacement for code point kFirst + i.
  static constexpr char32_t kBaseLetters[] =
      // U+00C0 .. U+00DF
      U"AAAAAA\u00C6CEEEEIIII\u00D0NOOOOO\u00D7\u00D8UUUUY\u00DE\u00DF"
      // U+00E0 .. U+00FF
      U"aaaaaa\u00E6ceeeeiiii\u00F0nooooo\u00F7\u00F8uuuuy\u00FEy";
  static_assert(sizeof(kBaseLetters) / sizeof(kBaseLetters[0]) == (kLast - kFirst + 1) + 1,
                "StripAccent table must have exactly one entry per code point plus the terminator");

  if (c < kFirst || c > kLast) {
    return c;
  }
  return kBaseLetters[c - kFirst];
}
#ifdef ENABLE_TF_STRING
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L28
// Widens one byte to 64 bits without sign extension: the result is always in
// 0..255, whether or not `char` is signed on the target.
static inline uint64_t ByteAs64(char c) {
  const auto raw_byte = static_cast<unsigned char>(c);
  return static_cast<uint64_t>(raw_byte);
}
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L41
// Reads the four bytes at `ptr` as a little-endian unsigned 32-bit value
// (ptr[0] is the least significant byte) and returns it widened to 64 bits.
// `ptr` must point to at least four readable bytes.
uint64_t DecodeFixed32(const char* ptr) {
  uint64_t value = 0;
  for (int i = 0; i < 4; ++i) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const auto byte = static_cast<unsigned char>(ptr[i]);
    value |= static_cast<uint64_t>(byte) << (8 * i);
  }
  return value;
}
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L55
// Reads the eight bytes at `ptr` as a little-endian unsigned 64-bit value by
// combining two 32-bit reads: bytes 0..3 form the low half, bytes 4..7 the
// high half. `ptr` must point to at least eight readable bytes.
static uint64_t DecodeFixed64(const char* ptr) {
  const uint64_t low_half = DecodeFixed32(ptr);
  const uint64_t high_half = DecodeFixed32(ptr + 4);
  return low_half | (high_half << 32);
}
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L79
// Computes a 64-bit hash of the `n` bytes starting at `data`, mixed with
// `seed`.
//
// The input is consumed in 8-byte little-endian words; the remaining 0..7
// tail bytes are folded in by the switch below.
//
// Fix over the previous version: the last switch label was `default:`
// instead of `case 1:`, so when no tail bytes were left (n is a multiple of
// 8, including n == 0) the code still read `data[0]` - one byte past the
// consumed input - and applied an extra multiplication. That is an
// out-of-bounds read and made the result differ from the TensorFlow
// implementation this function is taken from.
uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
  const uint64_t m = 0xc6a4a7935bd1e995;
  const int r = 47;

  uint64_t h = seed ^ (n * m);

  // Body: one 8-byte word per iteration.
  while (n >= 8) {
    uint64_t k = DecodeFixed64(data);
    data += 8;
    n -= 8;

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Tail: 0..7 bytes are left. Each case deliberately falls through so that
  // all remaining bytes are mixed in, most significant first.
  switch (n) {
    case 7:
      h ^= ByteAs64(data[6]) << 48;
      [[fallthrough]];
    case 6:
      h ^= ByteAs64(data[5]) << 40;
      [[fallthrough]];
    case 5:
      h ^= ByteAs64(data[4]) << 32;
      [[fallthrough]];
    case 4:
      h ^= ByteAs64(data[3]) << 24;
      [[fallthrough]];
    case 3:
      h ^= ByteAs64(data[2]) << 16;
      [[fallthrough]];
    case 2:
      h ^= ByteAs64(data[1]) << 8;
      [[fallthrough]];
    case 1:
      h ^= ByteAs64(data[0]);
      h *= m;
      break;
    default:
      // n == 0: nothing left to mix in, and `data` must not be dereferenced.
      break;
  }

  // Final avalanche.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}
// Hashes the `n` bytes starting at `data` with the standard library's
// std::hash<std::string_view>. The value is whatever the standard library in
// use produces, so it may differ between implementations.
uint64_t Hash64Fast(const char* data, size_t n) {
  const std::string_view bytes(data, n);
  std::hash<std::string_view> hasher{};
  return hasher(bytes);
}
#endif // ENABLE_TF_STRING
// Origin: microsoft/onnxruntime-extensions (https://github.com/microsoft/onnxruntime-extensions), base/string_utils.cc