Open Chinese Convert  1.1.9
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #ifdef _MSC_VER
22 #ifndef NOMINMAX
23 #define NOMINMAX
24 #endif
25 #include <Windows.h>
26 #endif // _MSC_VER
27 
28 #include <cstring>
29 
30 #include "Common.hpp"
31 #include "Exception.hpp"
32 
33 namespace opencc {
38 class OPENCC_EXPORT UTF8Util {
39 public:
/**
 * Declaration only; the definition is not part of this header.
 * NOTE(review): judging by the name, this presumably advances fp past a
 * leading UTF-8 byte order mark -- confirm against the definition.
 */
43  static void SkipUtf8Bom(FILE* fp);
44 
/**
 * Returns the length in bytes of the UTF8 character whose lead byte is *str,
 * as encoded in the high bits of that byte (1 to 6, which includes the
 * 5- and 6-byte forms). Only the lead byte is inspected.
 * Returns 0 instead of throwing when the byte cannot start a character:
 * a continuation byte (10xxxxxx), 0xFE or 0xFF.
 */
static size_t NextCharLengthNoException(const char* str) {
  // Work on the unsigned value so the bit patterns become plain ranges.
  const unsigned char lead = static_cast<unsigned char>(*str);
  if (lead < 0x80) {
    return 1;  // 0xxxxxxx
  }
  if (lead < 0xC0) {
    return 0;  // 10xxxxxx: continuation byte, not a lead byte
  }
  if (lead < 0xE0) {
    return 2;  // 110xxxxx
  }
  if (lead < 0xF0) {
    return 3;  // 1110xxxx
  }
  if (lead < 0xF8) {
    return 4;  // 11110xxx
  }
  if (lead < 0xFC) {
    return 5;  // 111110xx
  }
  if (lead < 0xFE) {
    return 6;  // 1111110x
  }
  return 0;  // 0xFE / 0xFF
}
66 
70  static size_t NextCharLength(const char* str) {
71  size_t length = NextCharLengthNoException(str);
72  if (length == 0) {
73  throw InvalidUTF8(str);
74  }
75  return length;
76  }
77 
81  static size_t PrevCharLength(const char* str) {
82  {
83  const size_t length = NextCharLengthNoException(str - 3);
84  if (length == 3) {
85  return length;
86  }
87  }
88  {
89  const size_t length = NextCharLengthNoException(str - 1);
90  if (length == 1) {
91  return length;
92  }
93  }
94  {
95  const size_t length = NextCharLengthNoException(str - 2);
96  if (length == 2) {
97  return length;
98  }
99  }
100  for (size_t i = 4; i <= 6; i++) {
101  const size_t length = NextCharLengthNoException(str - i);
102  if (length == i) {
103  return length;
104  }
105  }
106  throw InvalidUTF8(str);
107  }
108 
112  static const char* NextChar(const char* str) {
113  return str + NextCharLength(str);
114  }
115 
119  static const char* PrevChar(const char* str) {
120  return str - PrevCharLength(str);
121  }
122 
126  static size_t Length(const char* str) {
127  size_t length = 0;
128  while (*str != '\0') {
129  str = NextChar(str);
130  length++;
131  }
132  return length;
133  }
134 
141  static const char* FindNextInline(const char* str, const char ch) {
142  while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
143  str = NextChar(str);
144  }
145  return str;
146  }
147 
/**
 * Returns true when ch is '\0', '\n' or '\r'.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
154 
/**
 * Copies up to length bytes from str into a new string of exactly length
 * bytes. As with strncpy, copying stops at the first '\0' in str and the
 * remainder of the result is filled with '\0'.
 *
 * The buffer is written through &newStr[0] rather than through a
 * const_cast of c_str(): modifying the array returned by c_str() is
 * undefined behaviour.
 *
 * @param str    Source bytes; read up to length bytes or the first '\0'.
 * @param length Size in bytes of the returned string.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  if (length > 0) {
    std::strncpy(&newStr[0], str, length);
  }
  return newStr;
}
164 
/**
 * Returns true when the NUL-terminated string str contains at least
 * byteLength bytes before its terminator. Reads at most byteLength bytes
 * and stops at the first '\0'.
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
179 
184  static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
185  std::string wordTrunc;
186  if (NotShorterThan(str, maxByteLength)) {
187  size_t len = 0;
188  const char* pStr = str;
189  for (;;) {
190  const size_t charLength = NextCharLength(pStr);
191  if (len + charLength > maxByteLength) {
192  break;
193  }
194  pStr += charLength;
195  len += charLength;
196  }
197  wordTrunc = FromSubstr(str, len);
198  } else {
199  wordTrunc = str;
200  }
201  return wordTrunc;
202  }
203 
/**
 * Replaces every non-overlapping occurrence of from in str with to, in
 * place, scanning left to right. Text inserted by a replacement is not
 * rescanned.
 *
 * An empty from pattern leaves str untouched: std::string::find matches the
 * empty string at every position, so the loop would otherwise never
 * terminate.
 *
 * @param str  String modified in place.
 * @param from NUL-terminated pattern to search for.
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  // Pass the precomputed lengths so find/replace do not re-measure the
  // C strings on every iteration.
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;  // continue after the inserted text
  }
}
216 
/**
 * Concatenates the elements of strings in order, inserting separator
 * between each pair of adjacent elements. Returns an empty string for an
 * empty vector.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::string joined;
  std::vector<std::string>::const_iterator it = strings.begin();
  if (it == strings.end()) {
    return joined;
  }
  joined += *it;
  for (++it; it != strings.end(); ++it) {
    joined += separator;
    joined += *it;
  }
  return joined;
}
233 
/**
 * Concatenates the elements of strings in order with nothing between them.
 * Returns an empty string for an empty vector.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::string joined;
  for (std::vector<std::string>::const_iterator it = strings.begin();
       it != strings.end(); ++it) {
    joined += *it;
  }
  return joined;
}
244 
245  static void GetByteMap(const char* str, const size_t utf8Length,
246  std::vector<size_t>* byteMap) {
247  if (byteMap->size() < utf8Length) {
248  byteMap->resize(utf8Length);
249  }
250  const char* pstr = str;
251  for (size_t i = 0; i < utf8Length; i++) {
252  (*byteMap)[i] = pstr - str;
253  pstr = NextChar(pstr);
254  }
255  }
256 
/**
 * Returns str in the string type selected for the current compiler:
 * a copy of the std::string itself unless _MSC_VER is defined, in which
 * case the result of U8ToU16(str) is returned as a std::wstring.
 */
#ifndef _MSC_VER
static std::string GetPlatformString(const std::string& str) { return str; }
#else
static std::wstring GetPlatformString(const std::string& str) {
  return U8ToU16(str);
}
#endif  // _MSC_VER
264 
265 #ifdef _MSC_VER
266  static std::string U16ToU8(const std::wstring& wstr) {
267  std::string ret;
268  int length = static_cast<int>(wstr.length());
269  int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0,
270  NULL, NULL);
271  if (convcnt > 0) {
272  ret.resize(convcnt);
273  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt,
274  NULL, NULL);
275  }
276  return ret;
277  }
278 
279  static std::wstring U8ToU16(const std::string& str) {
280  std::wstring ret;
281  int length = static_cast<int>(str.length());
282  int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
283  if (convcnt > 0) {
284  ret.resize(convcnt);
285  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt);
286  }
287  return ret;
288  }
289 #endif // _MSC_VER
290 };
291 } // namespace opencc
Definition: Exception.hpp:77
UTF8 std::string utilities.
Definition: UTF8Util.hpp:38
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:151
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF8 character.
Definition: UTF8Util.hpp:81
static std::string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new string.
Definition: UTF8Util.hpp:158
static void ReplaceAll(std::string &str, const char *from, const char *to)
Replaces all patterns in a std::string in place.
Definition: UTF8Util.hpp:207
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:141
static size_t NextCharLengthNoException(const char *str)
Returns the length in bytes of the next UTF8 character, or 0 if the lead byte is invalid.
Definition: UTF8Util.hpp:49
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given C string is at least as long as the given length in bytes.
Definition: UTF8Util.hpp:169
static const char * PrevChar(const char *str)
Moves the char* pointer back to the start of the previous UTF8 character.
Definition: UTF8Util.hpp:119
static std::string Join(const std::vector< std::string > &strings)
Joins a std::string vector into a std::string.
Definition: UTF8Util.hpp:237
static std::string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a std::string with a maximal length in byte.
Definition: UTF8Util.hpp:184
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 std::string.
Definition: UTF8Util.hpp:126
static const char * NextChar(const char *str)
Returns the char* pointer advanced past the next UTF8 character.
Definition: UTF8Util.hpp:112
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF8 character; throws InvalidUTF8 if the lead byte is invalid.
Definition: UTF8Util.hpp:70
static std::string Join(const std::vector< std::string > &strings, const std::string &separator)
Joins a std::string vector into a std::string with a separator.
Definition: UTF8Util.hpp:220