microsoft/onnxruntime-extensions
Public · mirrored from https://github.com/microsoft/onnxruntime-extensions · Available
operators/text/re2_strings/string_regex_split_re.hpp
53 lines · mode: code
| 1 | // Copyright (c) Microsoft Corporation. All rights reserved. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | #pragma once |
| 5 | #include <string> |
| 6 | #include <algorithm> |
| 7 | #include "re2/re2.h" |
| 8 | |
| 9 | template <typename T> |
| 10 | void RegexSplitImpl(const std::string& input, const RE2& pattern, |
| 11 | bool include_delimiter, const RE2& include_delim_regex, |
| 12 | std::vector<std::string_view>& tokens, |
| 13 | std::vector<T>& begin_offsets, |
| 14 | std::vector<T>& end_offsets) { |
| 15 | re2::StringPiece leftover(input.data()); |
| 16 | re2::StringPiece last_end = leftover; |
| 17 | re2::StringPiece extracted_delim_token; |
| 18 | |
| 19 | // Keep looking for split points until we have reached the end of the input. |
| 20 | while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) { |
| 21 | std::string_view token(last_end.data(), |
| 22 | extracted_delim_token.data() - last_end.data()); |
| 23 | bool has_non_empty_token = token.length() > 0; |
| 24 | bool should_include_delim = |
| 25 | include_delimiter && include_delim_regex.FullMatch( |
| 26 | extracted_delim_token, include_delim_regex); |
| 27 | last_end = leftover; |
| 28 | |
| 29 | // Mark the end of the previous token, only if there was something. |
| 30 | if (has_non_empty_token) { |
| 31 | tokens.push_back(std::string_view(token.data(), token.size())); |
| 32 | // Mark the end of the last token |
| 33 | begin_offsets.push_back(token.data() - input.data()); |
| 34 | end_offsets.push_back(token.data() + token.length() - input.data()); |
| 35 | } |
| 36 | |
| 37 | if (should_include_delim) { |
| 38 | // If desired, include the deliminator as a token. |
| 39 | tokens.push_back(std::string_view(extracted_delim_token.data(), extracted_delim_token.size())); |
| 40 | // Mark the end of the token at the end of the beginning of the delimiter. |
| 41 | begin_offsets.push_back(extracted_delim_token.data() - input.data()); |
| 42 | end_offsets.push_back(extracted_delim_token.data() + |
| 43 | extracted_delim_token.length() - input.data()); |
| 44 | } |
| 45 | } |
| 46 | |
| 47 | // Close the last token. |
| 48 | if (!leftover.empty()) { |
| 49 | tokens.push_back(std::string_view(leftover.data(), leftover.size())); |
| 50 | begin_offsets.push_back(leftover.data() - input.data()); |
| 51 | end_offsets.push_back(leftover.data() + leftover.length() - input.data()); |
| 52 | } |
| 53 | } |