microsoft/onnxruntime-extensions

Public

Mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
13d9e27ccd8a0de9a1225756fbf6860a1931484f

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/text/re2_strings/string_regex_split_re.hpp

53lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4#pragma once
5#include <string>
6#include <algorithm>
7#include "re2/re2.h"
8
9template <typename T>
10void RegexSplitImpl(const std::string& input, const RE2& pattern,
11 bool include_delimiter, const RE2& include_delim_regex,
12 std::vector<std::string_view>& tokens,
13 std::vector<T>& begin_offsets,
14 std::vector<T>& end_offsets) {
15 re2::StringPiece leftover(input.data());
16 re2::StringPiece last_end = leftover;
17 re2::StringPiece extracted_delim_token;
18
19 // Keep looking for split points until we have reached the end of the input.
20 while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) {
21 std::string_view token(last_end.data(),
22 extracted_delim_token.data() - last_end.data());
23 bool has_non_empty_token = token.length() > 0;
24 bool should_include_delim =
25 include_delimiter && include_delim_regex.FullMatch(
26 extracted_delim_token, include_delim_regex);
27 last_end = leftover;
28
29 // Mark the end of the previous token, only if there was something.
30 if (has_non_empty_token) {
31 tokens.push_back(std::string_view(token.data(), token.size()));
32 // Mark the end of the last token
33 begin_offsets.push_back(token.data() - input.data());
34 end_offsets.push_back(token.data() + token.length() - input.data());
35 }
36
37 if (should_include_delim) {
38 // If desired, include the deliminator as a token.
39 tokens.push_back(std::string_view(extracted_delim_token.data(), extracted_delim_token.size()));
40 // Mark the end of the token at the end of the beginning of the delimiter.
41 begin_offsets.push_back(extracted_delim_token.data() - input.data());
42 end_offsets.push_back(extracted_delim_token.data() +
43 extracted_delim_token.length() - input.data());
44 }
45 }
46
47 // Close the last token.
48 if (!leftover.empty()) {
49 tokens.push_back(std::string_view(leftover.data(), leftover.size()));
50 begin_offsets.push_back(leftover.data() - input.data());
51 end_offsets.push_back(leftover.data() + leftover.length() - input.data());
52 }
53}