// Header preamble: include guard, language-level feature macros and the
// fixed-width integer typedefs used below.
//  - UTF_CPP_CPLUSPLUS defaults to __cplusplus unless already defined.
//  - UTF_CPP_OVERRIDE / UTF_CPP_NOEXCEPT are defined as `override` / `noexcept`
//    under the `>= 201103L` test, and as empty / `throw()` in the second pair.
// NOTE(review): this listing has collapsed many directives onto one physical
// line and fused listing numbers into them; embedded lines 39 and 44
// (presumably the `#endif` closing the UTF_CPP_CPLUSPLUS test and the `#else`
// separating the two macro pairs) are not present -- confirm against the
// upstream header before compiling.
28 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 37 #if !defined UTF_CPP_CPLUSPLUS 38 #define UTF_CPP_CPLUSPLUS __cplusplus 41 #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later 42 #define UTF_CPP_OVERRIDE override 43 #define UTF_CPP_NOEXCEPT noexcept 45 #define UTF_CPP_OVERRIDE 46 #define UTF_CPP_NOEXCEPT throw() 47 #endif // C++ 11 or later 55 typedef unsigned char uint8_t;
// 16-bit and 32-bit unsigned aliases (widths assume short is 16 bits and int is 32 bits).
56 typedef unsigned short uint16_t;
57 typedef unsigned int uint32_t;
// Surrogate-range boundaries and derived offsets.
// The stray listing numbers that were fused into each declaration have been
// removed so the declarations compile.

// Lead surrogates occupy 0xd800..0xdbff.
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
// Trail surrogates occupy 0xdc00..0xdfff.
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
// Equals LEAD_SURROGATE_MIN - (0x10000 >> 10).
const uint16_t LEAD_OFFSET         = 0xd7c0u;
// Equals 0x10000 - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN, modulo 2^32.
const uint32_t SURROGATE_OFFSET    = 0xfca02400u;

// Upper bound used by is_code_point_valid().
const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
// Reduces `oc` to its low 8 bits and returns it as uint8_t.
// Useful when octet_type is a signed char type: the value is masked after
// integral promotion, so a negative char yields its unsigned byte value.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(0xff & oc);
}
// Reduces `oc` to its low 16 bits and returns it as uint16_t.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(0xffff & oc);
}
85 template<
typename octet_type>
86 inline bool is_trail(octet_type oc)
88 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
91 template <
typename u16>
92 inline bool is_lead_surrogate(u16 cp)
94 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
97 template <
typename u16>
98 inline bool is_trail_surrogate(u16 cp)
100 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
103 template <
typename u16>
104 inline bool is_surrogate(u16 cp)
106 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
109 template <
typename u32>
110 inline bool is_code_point_valid(u32 cp)
112 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
// Classifies the octet at `lead_it` by its high bits; the result type is the
// iterator's difference_type.
// The visible tests match the patterns 110xxxxx ((lead >> 5) == 0x6),
// 1110xxxx ((lead >> 4) == 0xe) and 11110xxx ((lead >> 3) == 0x1e).
// NOTE(review): embedded lines 118, 120-121, 123, 125 and 127-131 (the opening
// `if`, every `return` statement and the braces) are absent from this listing,
// so the value returned by each branch cannot be confirmed here -- restore
// them from the upstream header.
115 template <
typename octet_iterator>
116 inline typename std::iterator_traits<octet_iterator>::difference_type
117 sequence_length(octet_iterator lead_it)
// Mask the dereferenced octet to 8 bits before shifting.
119 uint8_t lead = utf8::internal::mask8(*lead_it);
122 else if ((lead >> 5) == 0x6)
124 else if ((lead >> 4) == 0xe)
126 else if ((lead >> 3) == 0x1e)
// Takes a decoded code point `cp` and a sequence `length` and returns bool.
// The visible branches bucket `cp` by the thresholds 0x800 and 0x10000.
// NOTE(review): the first branch, every branch body and the return statements
// (embedded lines 134-138, 140-142, 144-150) are absent from this listing;
// presumably each bucket compares `length` against the minimal octet count for
// that range -- confirm against the upstream header.
132 template <
typename octet_difference_type>
133 inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
139 else if (cp < 0x800) {
143 else if (cp < 0x10000) {
// Status codes returned by the validation and decoding helpers in this file.
// UTF8_OK is the first enumerator (value 0); every other value is an error.
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
// Helper taking the iterator by reference plus the range end; returns a
// utf_error.
// Visible outcomes: NOT_ENOUGH_ROOM, and INCOMPLETE_SEQUENCE when the octet at
// `it` is not a trail octet (is_trail is false).
// NOTE(review): embedded lines 156-157, 159 and 162-165 are absent from this
// listing -- the condition guarding NOT_ENOUGH_ROOM (presumably advancing `it`
// and comparing with `end`) and the success return are not visible; confirm
// against the upstream header.
154 template <
typename octet_iterator>
155 utf_error increase_safely(octet_iterator& it, octet_iterator end)
158 return NOT_ENOUGH_ROOM;
// The octet now at `it` must have the 10xxxxxx form checked by is_trail.
160 if (!utf8::internal::is_trail(*it))
161 return INCOMPLETE_SEQUENCE;
// UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END): calls increase_safely(IT, END)
// and makes the enclosing function return the error code when the result is
// not UTF8_OK.
//
// get_sequence_1: stores the 8-bit masked octet at `it` into `code_point`.
// NOTE(review): embedded lines 171-172, 174 and 176-179 are absent from this
// listing -- the guard in front of NOT_ENOUGH_ROOM (presumably `it == end`)
// and the success return are not visible; confirm against the upstream header.
166 #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} 169 template <
typename octet_iterator>
170 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
173 return NOT_ENOUGH_ROOM;
175 code_point = utf8::internal::mask8(*it);
// Assembles `code_point` from two octets starting at `it`.
// NOTE(review): embedded lines 182-183, 185, 187, 189 and 191-194 are absent
// from this listing -- the guard in front of NOT_ENOUGH_ROOM and the success
// return are not visible; confirm against the upstream header.
180 template <
typename octet_iterator>
181 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
184 return NOT_ENOUGH_ROOM;
// Start from the first octet, masked to 8 bits.
186 code_point = utf8::internal::mask8(*it);
// Advance to the next octet; the macro returns early on any error.
188 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 5 bits of the first octet become bits 6-10; low 6 bits of the second
// octet fill bits 0-5.
190 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
// Assembles `code_point` from three octets starting at `it`.
// NOTE(review): embedded lines 197-198, 200, 202, 204, 206, 208 and 210-213
// are absent from this listing -- the guard in front of NOT_ENOUGH_ROOM and
// the success return are not visible; confirm against the upstream header.
195 template <
typename octet_iterator>
196 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 return NOT_ENOUGH_ROOM;
// Start from the first octet, masked to 8 bits.
201 code_point = utf8::internal::mask8(*it);
// Advance to the second octet; the macro returns early on any error.
203 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 4 bits of the first octet become bits 12-15; low 6 bits of the second
// octet become bits 6-11.
205 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
// Advance to the third octet.
207 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 6 bits of the third octet fill bits 0-5.
209 code_point += (*it) & 0x3f;
// Assembles `code_point` from four octets starting at `it`.
// NOTE(review): embedded lines 216-217, 219, 221, 223, 225, 227, 229, 231 and
// 233-236 are absent from this listing -- the guard in front of
// NOT_ENOUGH_ROOM and the success return are not visible; confirm against the
// upstream header.
214 template <
typename octet_iterator>
215 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
218 return NOT_ENOUGH_ROOM;
// Start from the first octet, masked to 8 bits.
220 code_point = utf8::internal::mask8(*it);
// Advance to the second octet; the macro returns early on any error.
222 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 3 bits of the first octet become bits 18-20; low 6 bits of the second
// octet become bits 12-17.
224 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
// Advance to the third octet.
226 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 6 bits of the third octet become bits 6-11.
228 code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
// Advance to the fourth octet.
230 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 6 bits of the fourth octet fill bits 0-5.
232 code_point += (*it) & 0x3f;
// The helper macro is undefined here, so it is not available past this point.
//
// validate_next(it, end, code_point): decodes one sequence starting at `it`
// and reports a utf_error. The visible flow is: obtain the sequence length
// from the octet at `it`, dispatch to get_sequence_1..4, then check the
// decoded value with is_code_point_valid and is_overlong_sequence.
// NOTE(review): many embedded lines (241-242, 244-246, 248-250, 253-254,
// 256-259, 261-262, 264-265, 267-268, 270-272, 274, 277-282, 284-285, 287-293)
// are absent from this listing. Not visible: the guard in front of
// NOT_ENOUGH_ROOM, the declaration of `cp`, the construct selecting between
// the get_sequence_N calls, the success path (what is written to `code_point`
// and how `it` is left), any use of `original_it`, and the final return.
// Confirm all of these against the upstream header.
237 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR 239 template <
typename octet_iterator>
240 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
243 return NOT_ENOUGH_ROOM;
// Remember the starting position (its later use is not visible in this listing).
247 octet_iterator original_it = it;
251 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
// Length classification comes from the octet at `it`.
252 const octet_difference_type length = utf8::internal::sequence_length(it);
255 utf_error err = UTF8_OK;
// One of the following decoders runs and leaves its result in `cp`.
260 err = utf8::internal::get_sequence_1(it, end, cp);
263 err = utf8::internal::get_sequence_2(it, end, cp);
266 err = utf8::internal::get_sequence_3(it, end, cp);
269 err = utf8::internal::get_sequence_4(it, end, cp);
// Post-decode checks run only when decoding itself succeeded.
273 if (err == UTF8_OK) {
275 if (utf8::internal::is_code_point_valid(cp)) {
276 if (!utf8::internal::is_overlong_sequence(cp, length)){
283 err = OVERLONG_SEQUENCE;
286 err = INVALID_CODE_POINT;
294 template <
typename octet_iterator>
295 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
297 return utf8::internal::validate_next(it, end, ignored);
// The three octets compared against by starts_with_bom(): 0xef 0xbb 0xbf.
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Walks [start, end) by calling validate_next repeatedly on a cursor that
// starts at `start`, checking each call's status against UTF8_OK; returns an
// octet_iterator.
// NOTE(review): embedded lines 309 and 314-318 are absent from this listing --
// the statement executed on a non-OK status and the final return are not
// visible (presumably both return the cursor); confirm against the upstream
// header.
307 template <
typename octet_iterator>
308 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
310 octet_iterator result = start;
311 while (result != end) {
// validate_next takes the cursor by reference.
312 utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
313 if (err_code != internal::UTF8_OK)
319 template <
typename octet_iterator>
320 inline bool is_valid(octet_iterator start, octet_iterator end)
322 return (utf8::find_invalid(start, end) == end);
325 template <
typename octet_iterator>
326 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
329 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
330 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
331 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
336 #endif // header guard