mirror of
https://github.com/mangosfour/server.git
synced 2025-12-13 04:37:00 +00:00
[9231] Update used utf8 cpp library version up to 2.2.4
This commit is contained in:
parent
cee525f9c8
commit
6653539a5e
6 changed files with 311 additions and 177 deletions
|
|
@ -1,9 +1,9 @@
|
|||
utf8 cpp library
|
||||
Release 2.1
|
||||
Release 2.2.4
|
||||
|
||||
This is a minor feature release - added the function peek_next.
|
||||
This is a minor bug fix release that improves error detection when converting from UTF-16 to UTF-8.
|
||||
|
||||
Changes from version 2.0
|
||||
- Implemented feature request [ 1770746 ] "Provide a const version of next() (some sort of a peek() )"
|
||||
Changes from version 2.2.3
|
||||
- Bug fix [2857454] dereference invalid iterator when lead surrogate was last element of the string.
|
||||
|
||||
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
|
||||
|
|
|
|||
|
|
@ -57,6 +57,16 @@
|
|||
</li>
|
||||
<li>
|
||||
<a href="#examples">Examples of Use</a>
|
||||
<ul class="toc">
|
||||
<li>
|
||||
<a href="#introsample">Introductory Sample</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#validfile">Checking if a file contains valid UTF-8 text</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#fixinvalid">Ensure that a string contains valid UTF-8 text</a>
|
||||
</li>
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#reference">Reference</a>
|
||||
|
|
@ -91,14 +101,14 @@
|
|||
</h2>
|
||||
<p>
|
||||
Many C++ developers miss an easy and portable way of handling Unicode encoded
|
||||
strings. C++ Standard is currently Unicode agnostic, and while some work is being
|
||||
done to introduce Unicode to the next incarnation called C++0x, for the moment
|
||||
nothing of the sort is available. In the meantime, developers use 3rd party
|
||||
libraries like ICU, OS specific capabilities, or simply roll out their own
|
||||
solutions.
|
||||
strings. The original C++ Standard (known as C++98 or C++03) is Unicode agnostic,
|
||||
and while some work is being done to introduce Unicode to the next incarnation
|
||||
called C++0x, for the moment nothing of the sort is available. In the meantime,
|
||||
developers use third party libraries like ICU, OS specific capabilities, or simply
|
||||
roll out their own solutions.
|
||||
</p>
|
||||
<p>
|
||||
In order to easily handle UTF-8 encoded Unicode strings, I have come up with a small
|
||||
In order to easily handle UTF-8 encoded Unicode strings, I came up with a small
|
||||
generic library. For anybody used to working with STL algorithms and iterators, it should be
|
||||
easy and natural to use. The code is freely available for any purpose - check out
|
||||
the license at the beginning of the utf8.h file. If you run into
|
||||
|
|
@ -115,11 +125,13 @@
|
|||
<h2 id="examples">
|
||||
Examples of use
|
||||
</h2>
|
||||
<h3 id="introsample">
|
||||
Introductory Sample
|
||||
</h3>
|
||||
<p>
|
||||
To illustrate the use of this utf8 library, we shall open a file containing UTF-8
|
||||
encoded text, check whether it starts with a byte order mark, read each line into a
|
||||
<code>std::string</code>, check it for validity, convert the text to UTF-16, and
|
||||
back to UTF-8:
|
||||
To illustrate the use of the library, let's start with a small but complete program
|
||||
that opens a file containing UTF-8 encoded text, reads it line by line, checks each line
|
||||
for invalid UTF-8 byte sequences, and converts it to UTF-16 encoding and back to UTF-8:
|
||||
</p>
|
||||
<pre>
|
||||
<span class="preprocessor">#include <fstream></span>
|
||||
|
|
@ -128,33 +140,26 @@
|
|||
<span class="preprocessor">#include <vector></span>
|
||||
<span class="preprocessor">#include "utf8.h"</span>
|
||||
<span class="keyword">using namespace</span> std;
|
||||
<span class="keyword">int</span> main()
|
||||
<span class="keyword">int</span> main(<span class="keyword">int</span> argc, <span class="keyword">char</span>** argv)
|
||||
{
|
||||
<span class="keyword">if</span> (argc != <span class="literal">2</span>) {
|
||||
cout << <span class="literal">"\nUsage: docsample filename\n"</span>;
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
}
|
||||
|
||||
<span class="keyword">const char</span>* test_file_path = argv[1];
|
||||
<span class="comment">// Open the test file (must be UTF-8 encoded)</span>
|
||||
<span class="comment">// Open the test file (contains UTF-8 encoded text)</span>
|
||||
ifstream fs8(test_file_path);
|
||||
<span class="keyword">if</span> (!fs8.is_open()) {
|
||||
cout << <span class=
|
||||
"literal">"Could not open "</span> << test_file_path << endl;
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
}
|
||||
<span class="comment">// Read the first line of the file</span>
|
||||
|
||||
<span class="keyword">unsigned</span> line_count = <span class="literal">1</span>;
|
||||
string line;
|
||||
<span class="keyword">if</span> (!getline(fs8, line))
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
<span class="comment">// Look for utf-8 byte-order mark at the beginning</span>
|
||||
<span class="keyword">if</span> (line.size() > <span class="literal">2</span>) {
|
||||
<span class="keyword">if</span> (utf8::is_bom(line.c_str()))
|
||||
cout << <span class=
|
||||
"literal">"There is a byte order mark at the beginning of the file\n"</span>;
|
||||
}
|
||||
<span class="comment">// Play with all the lines in the file</span>
|
||||
<span class="keyword">do</span> {
|
||||
<span class="keyword">while</span> (getline(fs8, line)) {
|
||||
<span class="comment">// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)</span>
|
||||
string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
|
||||
<span class="keyword">if</span> (end_it != line.end()) {
|
||||
|
|
@ -165,38 +170,88 @@
|
|||
"literal">"This part is fine: "</span> << string(line.begin(), end_it) << <span
|
||||
class="literal">"\n"</span>;
|
||||
}
|
||||
|
||||
<span class="comment">// Get the line length (at least for the valid part)</span>
|
||||
<span class="keyword">int</span> length = utf8::distance(line.begin(), end_it);
|
||||
cout << <span class=
|
||||
"literal">"Length of line "</span> << line_count << <span class=
|
||||
"literal">" is "</span> << length << <span class="literal">"\n"</span>;
|
||||
|
||||
<span class="comment">// Convert it to utf-16</span>
|
||||
vector<unsigned short> utf16line;
|
||||
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
|
||||
|
||||
<span class="comment">// And back to utf-8</span>
|
||||
string utf8line;
|
||||
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
|
||||
|
||||
<span class="comment">// Confirm that the conversion went OK:</span>
|
||||
<span class="keyword">if</span> (utf8line != string(line.begin(), end_it))
|
||||
cout << <span class=
|
||||
"literal">"Error in UTF-16 conversion at line: "</span> << line_count << <span
|
||||
class="literal">"\n"</span>;
|
||||
getline(fs8, line);
|
||||
|
||||
line_count++;
|
||||
} <span class="keyword">while</span> (!fs8.eof());
|
||||
}
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
}
|
||||
</pre>
|
||||
<p>
|
||||
In the previous code sample, we have seen the use of the following functions from
|
||||
<code>utf8</code> namespace: first we used <code>is_bom</code> function to detect
|
||||
UTF-8 byte order mark at the beginning of the file; then for each line we performed
|
||||
In the previous code sample, for each line we performed
|
||||
a detection of invalid UTF-8 sequences with <code>find_invalid</code>; the number
|
||||
of characters (more precisely - the number of Unicode code points) in each line was
|
||||
of characters (more precisely - the number of Unicode code points, including the end
|
||||
of line and even BOM if there is one) in each line was
|
||||
determined with the use of <code>utf8::distance</code>; finally, we have converted
|
||||
each line to UTF-16 encoding with <code>utf8to16</code> and back to UTF-8 with
|
||||
<code>utf16to8</code>.
|
||||
</p>
|
||||
<h3 id="validfile">Checking if a file contains valid UTF-8 text</h3>
|
||||
<p>
|
||||
Here is a function that checks whether the content of a file is valid UTF-8 encoded text without
|
||||
reading the content into memory:
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">bool</span> valid_utf8_file(<span class="keyword">const char</span>* file_name)
|
||||
{
|
||||
ifstream ifs(file_name);
|
||||
<span class="keyword">if</span> (!ifs)
|
||||
<span class="keyword">return false</span>; <span class="comment">// even better, throw here</span>
|
||||
|
||||
istreambuf_iterator<<span class="keyword">char</span>> it(ifs.rdbuf());
|
||||
istreambuf_iterator<<span class="keyword">char</span>> eos;
|
||||
|
||||
<span class="keyword">return</span> utf8::is_valid(it, eos);
|
||||
}
|
||||
</pre>
|
||||
<p>
|
||||
Because the function <code>utf8::is_valid()</code> works with input iterators, we were able
|
||||
to pass an <code>istreambuf_iterator</code> to it and read the content of the file directly
|
||||
without loading it into memory first.</p>
|
||||
<p>
|
||||
Note that other functions that take input iterator arguments can be used in a similar way. For
|
||||
instance, to read the content of a UTF-8 encoded text file and convert the text to UTF-16, just
|
||||
do something like:
|
||||
</p>
|
||||
<pre>
|
||||
utf8::utf8to16(it, eos, back_inserter(u16string));
|
||||
</pre>
|
||||
<h3 id="fixinvalid">Ensure that a string contains valid UTF-8 text</h3>
|
||||
<p>
|
||||
If we have some text that "probably" contains UTF-8 encoded text and we want to
|
||||
replace any invalid UTF-8 sequence with a replacement character, something like
|
||||
the following function may be used:
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">void</span> fix_utf8_string(std::string& str)
|
||||
{
|
||||
std::string temp;
|
||||
utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp));
|
||||
str = temp;
|
||||
}
|
||||
</pre>
|
||||
<p>The function will replace any invalid UTF-8 sequence with a Unicode replacement character.
|
||||
There is an overloaded function that enables the caller to supply their own replacement character.
|
||||
</p>
|
||||
<h2 id="reference">
|
||||
Reference
|
||||
</h2>
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ namespace utf8
|
|||
octet_iterator sequence_start = start;
|
||||
internal::utf_error err_code = internal::validate_next(start, end);
|
||||
switch (err_code) {
|
||||
case internal::OK :
|
||||
case internal::UTF8_OK :
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
break;
|
||||
|
|
@ -120,15 +120,12 @@ namespace utf8
|
|||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else if (cp <= internal::CODE_POINT_MAX) { // four octets
|
||||
else { // four octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 12)& 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else
|
||||
throw invalid_code_point(cp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -138,7 +135,7 @@ namespace utf8
|
|||
uint32_t cp = 0;
|
||||
internal::utf_error err_code = internal::validate_next(it, end, &cp);
|
||||
switch (err_code) {
|
||||
case internal::OK :
|
||||
case internal::UTF8_OK :
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM :
|
||||
throw not_enough_room();
|
||||
|
|
@ -204,18 +201,22 @@ namespace utf8
|
|||
while (start != end) {
|
||||
uint32_t cp = internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (internal::is_surrogate(cp)) {
|
||||
if (internal::is_lead_surrogate(cp)) {
|
||||
if (start != end) {
|
||||
uint32_t trail_surrogate = internal::mask16(*start++);
|
||||
if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
|
||||
if (internal::is_trail_surrogate(trail_surrogate))
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
|
||||
}
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(*start));
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
}
|
||||
// Lone trail surrogate
|
||||
else if (internal::is_trail_surrogate(cp))
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
result = append(cp, result);
|
||||
}
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -30,23 +30,14 @@ DEALINGS IN THE SOFTWARE.
|
|||
|
||||
#include <iterator>
|
||||
|
||||
// use MaNGOS core types
|
||||
#include "Platform/Define.h"
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
|
||||
// You may need to change them to match your system.
|
||||
// These typedefs have the same names as ones from cstdint, or boost/cstdint
|
||||
|
||||
/* use MaNGOS alternatives
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
*/
|
||||
typedef uint8 uint8_t;
|
||||
typedef uint16 uint16_t;
|
||||
typedef uint32 uint32_t;
|
||||
|
||||
// Helper code - not intended to be directly called by the library users. May be changed at any time
|
||||
namespace internal
|
||||
|
|
@ -80,6 +71,18 @@ namespace internal
|
|||
return ((mask8(oc) >> 6) == 0x2);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_lead_surrogate(u16 cp)
|
||||
{
|
||||
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_trail_surrogate(u16 cp)
|
||||
{
|
||||
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_surrogate(u16 cp)
|
||||
{
|
||||
|
|
@ -109,117 +112,192 @@ namespace internal
|
|||
return 0;
|
||||
}
|
||||
|
||||
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
|
||||
/// Reports whether a decoded code point was stored in a sequence whose length
/// differs from the one accepted for the range the code point falls into.
///   cp     - the decoded code point
///   length - number of octets the sequence occupied
/// Code points of 0x10000 and above are never reported as overlong.
inline bool is_overlong_sequence(uint32_t cp, int length)
{
    // Pick the only octet count accepted for the range cp belongs to.
    int expected_length;
    if (cp < 0x80)
        expected_length = 1;
    else if (cp < 0x800)
        expected_length = 2;
    else if (cp < 0x10000)
        expected_length = 3;
    else
        return false;

    return length != expected_length;
}
|
||||
|
||||
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
|
||||
|
||||
/// get_sequence_x functions decode utf-8 sequences of the length x
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
if (it != end) {
|
||||
if (code_point)
|
||||
*code_point = mask8(*it);
|
||||
return UTF8_OK;
|
||||
}
|
||||
return NOT_ENOUGH_ROOM;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||
|
||||
if (it != end) {
|
||||
uint32_t cp = mask8(*it);
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
ret_code = UTF8_OK;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
|
||||
return ret_code;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||
|
||||
if (it != end) {
|
||||
uint32_t cp = mask8(*it);
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp += (*it) & 0x3f;
|
||||
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
ret_code = UTF8_OK;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
|
||||
return ret_code;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
utf_error ret_code = NOT_ENOUGH_ROOM;
|
||||
|
||||
if (it != end) {
|
||||
uint32_t cp = mask8(*it);
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp += (mask8(*it) << 6) & 0xfff;
|
||||
if (++it != end) {
|
||||
if (is_trail(*it)) {
|
||||
cp += (*it) & 0x3f;
|
||||
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
ret_code = UTF8_OK;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
else
|
||||
ret_code = INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
else
|
||||
ret_code = NOT_ENOUGH_ROOM;
|
||||
}
|
||||
|
||||
return ret_code;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
uint32_t cp = mask8(*it);
|
||||
// Check the lead octet
|
||||
// Save the original value of it so we can go back in case of failure
|
||||
// Of course, it does not make much sense with e.g. stream iterators
|
||||
octet_iterator original_it = it;
|
||||
|
||||
uint32_t cp = 0;
|
||||
// Determine the sequence length based on the lead octet
|
||||
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||
octet_difference_type length = sequence_length(it);
|
||||
|
||||
// "Shortcut" for ASCII characters
|
||||
if (length == 1) {
|
||||
if (end - it > 0) {
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
++it;
|
||||
return OK;
|
||||
}
|
||||
else
|
||||
return NOT_ENOUGH_ROOM;
|
||||
}
|
||||
|
||||
// Do we have enough memory?
|
||||
if (std::distance(it, end) < length)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
// Check trail octets and calculate the code point
|
||||
switch (length) {
|
||||
case 0:
|
||||
if (length == 0)
|
||||
return INVALID_LEAD;
|
||||
|
||||
// Now that we have a valid sequence length, get trail octets and calculate the code point
|
||||
utf_error err = UTF8_OK;
|
||||
switch (length) {
|
||||
case 1:
|
||||
err = get_sequence_1(it, end, &cp);
|
||||
break;
|
||||
case 2:
|
||||
if (is_trail(*(++it))) {
|
||||
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
err = get_sequence_2(it, end, &cp);
|
||||
break;
|
||||
case 3:
|
||||
if (is_trail(*(++it))) {
|
||||
cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
|
||||
if (is_trail(*(++it))) {
|
||||
cp += (*it) & 0x3f;
|
||||
}
|
||||
else {
|
||||
std::advance(it, -2);
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
err = get_sequence_3(it, end, &cp);
|
||||
break;
|
||||
case 4:
|
||||
if (is_trail(*(++it))) {
|
||||
cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
|
||||
if (is_trail(*(++it))) {
|
||||
cp += (mask8(*it) << 6) & 0xfff;
|
||||
if (is_trail(*(++it))) {
|
||||
cp += (*it) & 0x3f;
|
||||
}
|
||||
else {
|
||||
std::advance(it, -3);
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
std::advance(it, -2);
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
err = get_sequence_4(it, end, &cp);
|
||||
break;
|
||||
}
|
||||
// Is the code point valid?
|
||||
if (!is_code_point_valid(cp)) {
|
||||
for (octet_difference_type i = 0; i < length - 1; ++i)
|
||||
--it;
|
||||
return INVALID_CODE_POINT;
|
||||
}
|
||||
|
||||
if (err == UTF8_OK) {
|
||||
// Decoding succeeded. Now, security checks...
|
||||
if (is_code_point_valid(cp)) {
|
||||
if (!is_overlong_sequence(cp, length)){
|
||||
// Passed! Return here.
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
|
||||
if (cp < 0x80) {
|
||||
if (length != 1) {
|
||||
std::advance(it, -(length-1));
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else if (cp < 0x800) {
|
||||
if (length != 2) {
|
||||
std::advance(it, -(length-1));
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else if (cp < 0x10000) {
|
||||
if (length != 3) {
|
||||
std::advance(it, -(length-1));
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
|
||||
++it;
|
||||
return OK;
|
||||
return UTF8_OK;
|
||||
}
|
||||
else
|
||||
err = OVERLONG_SEQUENCE;
|
||||
}
|
||||
else
|
||||
err = INVALID_CODE_POINT;
|
||||
}
|
||||
|
||||
// Failure branch - restore the original value of the iterator
|
||||
it = original_it;
|
||||
return err;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
|
|
@ -240,7 +318,7 @@ namespace internal
|
|||
octet_iterator result = start;
|
||||
while (result != end) {
|
||||
internal::utf_error err_code = internal::validate_next(result, end);
|
||||
if (err_code != internal::OK)
|
||||
if (err_code != internal::UTF8_OK)
|
||||
return result;
|
||||
}
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -45,13 +45,13 @@ namespace utf8
|
|||
}
|
||||
else if (cp < 0x10000) { // three octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else { // four octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||
*(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
return result;
|
||||
|
|
@ -132,7 +132,7 @@ namespace utf8
|
|||
while (start != end) {
|
||||
uint32_t cp = internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (internal::is_surrogate(cp)) {
|
||||
if (internal::is_lead_surrogate(cp)) {
|
||||
uint32_t trail_surrogate = internal::mask16(*start++);
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#ifndef __REVISION_NR_H__
|
||||
#define __REVISION_NR_H__
|
||||
#define REVISION_NR "9230"
|
||||
#define REVISION_NR "9231"
|
||||
#endif // __REVISION_NR_H__
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue