[9231] Update used utf8 cpp library version up to 2.2.4

2025-12-12 19:37:03 +00:00 · 2010-01-21 21:41:21 +03:00 · 2010-01-21 21:41:21 +03:00 · 6653539a5e
commit 6653539a5e
parent cee525f9c8
6 changed files with 311 additions and 177 deletions
--- a/dep/include/utf8cpp/doc/ReleaseNotes
+++ b/dep/include/utf8cpp/doc/ReleaseNotes
@ -1,9 +1,9 @@
 utf8 cpp library
-Release 2.1
+Release 2.2.4

-This is a minor feature release - added the function peek_next. 
+This is a minor bug fix release that improves error detection when converting from UTF-16 to UTF-8.

-Changes from version 2.o
- Implemented feature request [ 1770746 ] "Provide a const version of next() (some sort of a peek() )
+Changes from version 2.2.3
+- Bug fix [2857454]: dereferencing an invalid iterator when a lead surrogate was the last element of the string.

 Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
--- a/dep/include/utf8cpp/doc/utf8cpp.html
+++ b/dep/include/utf8cpp/doc/utf8cpp.html
@ -57,6 +57,16 @@
        </li>
        <li>
          <a href="#examples">Examples of Use</a>
+          <ul class="toc">
+            <li>
+              <a href="#introsample">Introductory Sample</a>
+            </li>
+            <li>
+              <a href="#validfile">Checking if a file contains valid UTF-8 text</a>
+            </li>
+            <li>
+              <a href="#fixinvalid">Ensure that a string contains valid UTF-8 text</a></li>
+          </ul>
        </li>
        <li>
          <a href="#reference">Reference</a>
@ -91,14 +101,14 @@
    </h2>
    <p>
      Many C++ developers miss an easy and portable way of handling Unicode encoded
-      strings. C++ Standard is currently Unicode agnostic, and while some work is being
-      done to introduce Unicode to the next incarnation called C++0x, for the moment
-      nothing of the sort is available. In the meantime, developers use 3rd party
-      libraries like ICU, OS specific capabilities, or simply roll out their own
-      solutions.
+      strings. The original C++ Standard (known as C++98 or C++03) is Unicode agnostic,
+      and while some work is being done to introduce Unicode to the next incarnation
+      called C++0x, for the moment nothing of the sort is available. In the meantime,
+      developers use third party libraries like ICU, OS specific capabilities, or simply
+      roll out their own solutions.
    </p>
    <p>
-      In order to easily handle UTF-8 encoded Unicode strings, I have come up with a small
+      In order to easily handle UTF-8 encoded Unicode strings, I came up with a small
      generic library. For anybody used to work with STL algorithms and iterators, it should be
      easy and natural to use. The code is freely available for any purpose - check out
      the license at the beginning of the utf8.h file. If you run into
@ -115,11 +125,13 @@
    <h2 id="examples">
      Examples of use
    </h2>
+    <h3 id="introsample">
+      Introductory Sample
+    </h3>
    <p>
-      To illustrate the use of this utf8 library, we shall open a file containing UTF-8
-      encoded text, check whether it starts with a byte order mark, read each line into a
-      <code>std::string</code>, check it for validity, convert the text to UTF-16, and
-      back to UTF-8:
+      To illustrate the use of the library, let's start with a small but complete program 
+      that opens a file containing UTF-8 encoded text, reads it line by line, checks each line
+      for invalid UTF-8 byte sequences, and converts it to UTF-16 encoding and back to UTF-8:
    </p>
 <pre>
 <span class="preprocessor">#include &lt;fstream&gt;</span>
@ -128,33 +140,26 @@
 <span class="preprocessor">#include &lt;vector&gt;</span>
 <span class="preprocessor">#include "utf8.h"</span>
 <span class="keyword">using namespace</span> std;
-<span class="keyword">int</span> main()
+<span class="keyword">int</span> main(<span class="keyword">int</span> argc, <span class="keyword">char</span>** argv)
 {
    <span class="keyword">if</span> (argc != <span class="literal">2</span>) {
        cout &lt;&lt; <span class="literal">"\nUsage: docsample filename\n"</span>;
        <span class="keyword">return</span> <span class="literal">0</span>;
    }
+
    <span class="keyword">const char</span>* test_file_path = argv[1];
-    <span class="comment">// Open the test file (must be UTF-8 encoded)</span>
+    <span class="comment">// Open the test file (contains UTF-8 encoded text)</span>
    ifstream fs8(test_file_path);
    <span class="keyword">if</span> (!fs8.is_open()) {
    cout &lt;&lt; <span class=
 "literal">"Could not open "</span> &lt;&lt; test_file_path &lt;&lt; endl;
    <span class="keyword">return</span> <span class="literal">0</span>;
    }
-    <span class="comment">// Read the first line of the file</span>
+
    <span class="keyword">unsigned</span> line_count = <span class="literal">1</span>;
    string line;
-    <span class="keyword">if</span> (!getline(fs8, line)) 
-        <span class="keyword">return</span> <span class="literal">0</span>;
-    <span class="comment">// Look for utf-8 byte-order mark at the beginning</span>
-    <span class="keyword">if</span> (line.size() &gt; <span class="literal">2</span>) {
-        <span class="keyword">if</span> (utf8::is_bom(line.c_str()))
-            cout &lt;&lt; <span class=
-"literal">"There is a byte order mark at the beginning of the file\n"</span>;
-    }
    <span class="comment">// Play with all the lines in the file</span>
-    <span class="keyword">do</span> {
+    <span class="keyword">while</span> (getline(fs8, line)) {
       <span class="comment">// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)</span>
        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
        <span class="keyword">if</span> (end_it != line.end()) {
@ -165,38 +170,88 @@
 "literal">"This part is fine: "</span> &lt;&lt; string(line.begin(), end_it) &lt;&lt; <span
 class="literal">"\n"</span>;
        }
+
        <span class="comment">// Get the line length (at least for the valid part)</span>
        <span class="keyword">int</span> length = utf8::distance(line.begin(), end_it);
        cout &lt;&lt; <span class=
 "literal">"Length of line "</span> &lt;&lt; line_count &lt;&lt; <span class=
 "literal">" is "</span> &lt;&lt; length &lt;&lt;  <span class="literal">"\n"</span>;
+
        <span class="comment">// Convert it to utf-16</span>
        vector&lt;unsigned short&gt; utf16line;
        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+
        <span class="comment">// And back to utf-8</span>
        string utf8line; 
        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+
        <span class="comment">// Confirm that the conversion went OK:</span>
        <span class="keyword">if</span> (utf8line != string(line.begin(), end_it))
            cout &lt;&lt; <span class=
 "literal">"Error in UTF-16 conversion at line: "</span> &lt;&lt; line_count &lt;&lt; <span
 class="literal">"\n"</span>;        
-        getline(fs8, line);
+
        line_count++;
-    } <span class="keyword">while</span> (!fs8.eof());
+    }
    <span class="keyword">return</span> <span class="literal">0</span>;
 }
 </pre>
    <p>
-      In the previous code sample, we have seen the use of the following functions from
-      <code>utf8</code> namespace: first we used <code>is_bom</code> function to detect
-      UTF-8 byte order mark at the beginning of the file; then for each line we performed
+      In the previous code sample, for each line we performed
      a detection of invalid UTF-8 sequences with <code>find_invalid</code>; the number
-      of characters (more precisely - the number of Unicode code points) in each line was
+      of characters (more precisely - the number of Unicode code points, including the end
+      of line and even BOM if there is one) in each line was
      determined with a use of <code>utf8::distance</code>; finally, we have converted
      each line to UTF-16 encoding with <code>utf8to16</code> and back to UTF-8 with
      <code>utf16to8</code>.
    </p>
+    <h3 id="validfile">Checking if a file contains valid UTF-8 text</h3>
+<p>
+Here is a function that checks whether the content of a file is valid UTF-8 encoded text without
+reading the content into memory:
+</p>
+<pre>    
+<span class="keyword">bool</span> valid_utf8_file(<span class="keyword">const char</span>* file_name)
+{
+    ifstream ifs(file_name);
+    <span class="keyword">if</span> (!ifs)
+        <span class="keyword">return false</span>; <span class="comment">// even better, throw here</span>
+
+    istreambuf_iterator&lt;<span class="keyword">char</span>&gt; it(ifs.rdbuf());
+    istreambuf_iterator&lt;<span class="keyword">char</span>&gt; eos;
+
+    <span class="keyword">return</span> utf8::is_valid(it, eos);
+}
+</pre>
+<p>
+Because the function <code>utf8::is_valid()</code> works with input iterators, we were able
+to pass an <code>istreambuf_iterator</code> to it and read the content of the file directly 
+without loading it into memory first.</p>
+<p>
+Note that other functions that take input iterator arguments can be used in a similar way. For
+instance, to read the content of a UTF-8 encoded text file and convert the text to UTF-16, just 
+do something like:
+</p>
+<pre>
+    utf8::utf8to16(it, eos, back_inserter(u16string));
+</pre>
+    <h3 id="fixinvalid">Ensure that a string contains valid UTF-8 text</h3>
+<p>
+If we have some text that "probably" contains UTF-8 encoded text and we want to
+replace any invalid UTF-8 sequence with a replacement character, something like 
+the following function may be used:
+</p>
+<pre>
+<span class="keyword">void</span> fix_utf8_string(std::string&amp; str)
+{
+    std::string temp;
+    utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp));
+    str = temp;
+}
+</pre>
+<p>The function will replace any invalid UTF-8 sequence with a Unicode replacement character. 
+There is an overloaded function that enables the caller to supply their own replacement character.
+</p>
    <h2 id="reference">
      Reference
    </h2>
--- a/dep/include/utf8cpp/utf8/checked.h
+++ b/dep/include/utf8cpp/utf8/checked.h
@ -64,7 +64,7 @@ namespace utf8
    };

    /// The library API - functions intended to be called by the users
- 
+
    template <typename octet_iterator, typename output_iterator>
    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
    {
@ -72,7 +72,7 @@ namespace utf8
            octet_iterator sequence_start = start;
            internal::utf_error err_code = internal::validate_next(start, end);
            switch (err_code) {
-                case internal::OK :
+                case internal::UTF8_OK :
                    for (octet_iterator it = sequence_start; it != start; ++it)
                        *out++ = *it;
                    break;
@ -92,7 +92,7 @@ namespace utf8
                        ++start;
                    break;
            }
-        }   
+        }
        return out;
    }

@ -106,11 +106,11 @@ namespace utf8
    template <typename octet_iterator>
    octet_iterator append(uint32_t cp, octet_iterator result)
    {
-        if (!internal::is_code_point_valid(cp)) 
+        if (!internal::is_code_point_valid(cp))
            throw invalid_code_point(cp);

        if (cp < 0x80)                        // one octet
-            *(result++) = static_cast<uint8_t>(cp);  
+            *(result++) = static_cast<uint8_t>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
@ -120,15 +120,12 @@ namespace utf8
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
        }
-        else if (cp <= internal::CODE_POINT_MAX) {      // four octets
+        else {      // four octets
            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
-            *(result++) = static_cast<uint8_t>(((cp >> 12)& 0x3f)   | 0x80);
+            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
        }
-        else
-            throw invalid_code_point(cp);
-
        return result;
    }

@ -138,7 +135,7 @@ namespace utf8
        uint32_t cp = 0;
        internal::utf_error err_code = internal::validate_next(it, end, &cp);
        switch (err_code) {
-            case internal::OK :
+            case internal::UTF8_OK :
                break;
            case internal::NOT_ENOUGH_ROOM :
                throw not_enough_room();
@ -149,7 +146,7 @@ namespace utf8
            case internal::INVALID_CODE_POINT :
                throw invalid_code_point(cp);
        }
-        return cp;        
+        return cp;
    }

    template <typename octet_iterator>
@ -162,7 +159,7 @@ namespace utf8
    uint32_t prior(octet_iterator& it, octet_iterator start)
    {
        octet_iterator end = it;
-        while (internal::is_trail(*(--it))) 
+        while (internal::is_trail(*(--it)))
            if (it < start)
                throw invalid_utf8(*it); // error - no lead byte in the sequence
        octet_iterator temp = it;
@ -174,7 +171,7 @@ namespace utf8
    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
    {
        octet_iterator end = it;
-        while (internal::is_trail(*(--it))) 
+        while (internal::is_trail(*(--it)))
            if (it == pass_start)
                throw invalid_utf8(*it); // error - no lead byte in the sequence
        octet_iterator temp = it;
@ -193,32 +190,36 @@ namespace utf8
    distance (octet_iterator first, octet_iterator last)
    {
        typename std::iterator_traits<octet_iterator>::difference_type dist;
-        for (dist = 0; first < last; ++dist) 
+        for (dist = 0; first < last; ++dist)
            next(first, last);
        return dist;
    }

    template <typename u16bit_iterator, typename octet_iterator>
    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
-    {       
+    {
        while (start != end) {
            uint32_t cp = internal::mask16(*start++);
            // Take care of surrogate pairs first
-            if (internal::is_surrogate(cp)) {
+            if (internal::is_lead_surrogate(cp)) {
                if (start != end) {
                    uint32_t trail_surrogate = internal::mask16(*start++);
-                    if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
-                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;                    
-                    else 
+                    if (internal::is_trail_surrogate(trail_surrogate))
+                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+                    else
                        throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
                }
-                else 
-                    throw invalid_utf16(static_cast<uint16_t>(*start));
-            
+                else
+                    throw invalid_utf16(static_cast<uint16_t>(cp));
+
            }
+            // Lone trail surrogate
+            else if (internal::is_trail_surrogate(cp))
+                throw invalid_utf16(static_cast<uint16_t>(cp));
+
            result = append(cp, result);
        }
-        return result;        
+        return result;
    }

    template <typename u16bit_iterator, typename octet_iterator>
@ -256,13 +257,13 @@ namespace utf8

    // The iterator class
    template <typename octet_iterator>
-    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
+    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
      octet_iterator it;
      octet_iterator range_start;
      octet_iterator range_end;
      public:
      iterator () {};
-      explicit iterator (const octet_iterator& octet_it, 
+      explicit iterator (const octet_iterator& octet_it,
                         const octet_iterator& range_start,
                         const octet_iterator& range_end) :
               it(octet_it), range_start(range_start), range_end(range_end)
@ -277,8 +278,8 @@ namespace utf8
          octet_iterator temp = it;
          return next(temp, range_end);
      }
-      bool operator == (const iterator& rhs) const 
-      { 
+      bool operator == (const iterator& rhs) const
+      {
          if (range_start != rhs.range_start || range_end != rhs.range_end)
              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
          return (it == rhs.it);
@ -287,7 +288,7 @@ namespace utf8
      {
          return !(operator == (rhs));
      }
-      iterator& operator ++ () 
+      iterator& operator ++ ()
      {
          next(it, range_end);
          return *this;
@ -297,7 +298,7 @@ namespace utf8
          iterator temp = *this;
          next(it, range_end);
          return temp;
-      }  
+      }
      iterator& operator -- ()
      {
          prior(it, range_start);
--- a/dep/include/utf8cpp/utf8/core.h
+++ b/dep/include/utf8cpp/utf8/core.h
@ -30,27 +30,18 @@ DEALINGS IN THE SOFTWARE.

 #include <iterator>

-// use MaNGOS core types
-#include "Platform/Define.h"
-
 namespace utf8
 {
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
-    // You may need to change them to match your system. 
+    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
-
-    /* use MaNGOS alternatives
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;
-    */
-    typedef uint8  uint8_t;
-    typedef uint16 uint16_t;
-    typedef uint32 uint32_t;

 // Helper code - not intended to be directly called by the library users. May be changed at any time
 namespace internal
-{    
+{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
@ -80,6 +71,18 @@ namespace internal
        return ((mask8(oc) >> 6) == 0x2);
    }

+    template <typename u16>
+    inline bool is_lead_surrogate(u16 cp)
+    {
+        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
+    }
+
+    template <typename u16>
+    inline bool is_trail_surrogate(u16 cp)
+    {
+        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+    }
+
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
@ -90,14 +93,14 @@ namespace internal
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
-    }  
+    }

    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
-        if (lead < 0x80) 
+        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
@ -105,121 +108,196 @@ namespace internal
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
-        else 
+        else
            return 0;
    }

-    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+    inline bool is_overlong_sequence(uint32_t cp, int length)
+    {
+        if (cp < 0x80) {
+            if (length != 1) 
+                return true;
+        }
+        else if (cp < 0x800) {
+            if (length != 2) 
+                return true;
+        }
+        else if (cp < 0x10000) {
+            if (length != 3) 
+                return true;
+        }
+
+        return false;
+    }
+
+    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+    /// get_sequence_x functions decode utf-8 sequences of the length x
+
+    template <typename octet_iterator>
+    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
+    {
+        if (it != end) {
+            if (code_point)
+                *code_point = mask8(*it);
+            return UTF8_OK;
+        }
+        return NOT_ENOUGH_ROOM;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
+    {
+        utf_error ret_code = NOT_ENOUGH_ROOM;
+
+        if (it != end) {
+            uint32_t cp = mask8(*it);
+            if (++it != end) {
+                if (is_trail(*it)) {
+                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+
+                    if (code_point)
+                        *code_point = cp;
+                    ret_code = UTF8_OK;
+                }
+                else
+                    ret_code = INCOMPLETE_SEQUENCE;
+            }
+            else
+                ret_code = NOT_ENOUGH_ROOM;
+        }
+
+        return ret_code;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
+    {
+        utf_error ret_code = NOT_ENOUGH_ROOM;
+
+        if (it != end) {
+            uint32_t cp = mask8(*it);
+            if (++it != end) {
+                if (is_trail(*it)) {
+                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
+                    if (++it != end) {
+                        if (is_trail(*it)) {
+                            cp += (*it) & 0x3f;
+
+                            if (code_point)
+                                *code_point = cp;
+                            ret_code = UTF8_OK;
+                        }
+                        else 
+                            ret_code = INCOMPLETE_SEQUENCE;
+                    }
+                    else
+                        ret_code = NOT_ENOUGH_ROOM;
+                }
+                else
+                    ret_code = INCOMPLETE_SEQUENCE;
+            }
+            else
+                ret_code = NOT_ENOUGH_ROOM;
+        }
+
+        return ret_code;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
+    {
+        utf_error ret_code = NOT_ENOUGH_ROOM;
+
+        if (it != end) {
+            uint32_t cp = mask8(*it);
+            if (++it != end) {
+                if (is_trail(*it)) {
+                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
+                    if (++it != end) {
+                        if (is_trail(*it)) {
+                            cp += (mask8(*it) << 6) & 0xfff;
+                            if (++it != end) {
+                                if (is_trail(*it)) {
+                                    cp += (*it) & 0x3f;
+
+                                    if (code_point)
+                                        *code_point = cp;
+                                    ret_code = UTF8_OK;
+                                }
+                                else
+                                    ret_code = INCOMPLETE_SEQUENCE;
+                            }
+                            else
+                                ret_code = NOT_ENOUGH_ROOM;
+                        }
+                        else
+                            ret_code = INCOMPLETE_SEQUENCE;
+                    }
+                    else
+                        ret_code = NOT_ENOUGH_ROOM;
+                }
+                else 
+                    ret_code = INCOMPLETE_SEQUENCE;
+            }
+            else
+                ret_code = NOT_ENOUGH_ROOM;
+        }
+
+        return ret_code;
+    }

    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
-        uint32_t cp = mask8(*it);
-        // Check the lead octet
+        // Save the original value of it so we can go back in case of failure
+        // Of course, it does not make much sense with e.g. stream iterators
+        octet_iterator original_it = it;
+
+        uint32_t cp = 0;
+        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        octet_difference_type length = sequence_length(it);
+        if (length == 0)
+            return INVALID_LEAD;

-        // "Shortcut" for ASCII characters
-        if (length == 1) {
-            if (end - it > 0) {
-                if (code_point)
-                    *code_point = cp;
-                ++it;
-                return OK;
-            }
-            else
-                return NOT_ENOUGH_ROOM;
-        }
-
-        // Do we have enough memory?     
-        if (std::distance(it, end) < length)
-            return NOT_ENOUGH_ROOM;
-        
-        // Check trail octets and calculate the code point
+        // Now that we have a valid sequence length, get trail octets and calculate the code point
+        utf_error err = UTF8_OK;
        switch (length) {
-            case 0:
-                return INVALID_LEAD;
+            case 1:
+                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
-                if (is_trail(*(++it))) { 
-                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
-                }
-                else {
-                    --it;
-                    return INCOMPLETE_SEQUENCE;
-                }
+                err = get_sequence_2(it, end, &cp);
            break;
            case 3:
-                if (is_trail(*(++it))) {
-                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
-                    if (is_trail(*(++it))) {
-                        cp += (*it) & 0x3f;
-                    }
-                    else {
-                        std::advance(it, -2);
-                        return INCOMPLETE_SEQUENCE;
-                    }
-                }
-                else {
-                    --it;
-                    return INCOMPLETE_SEQUENCE;
-                }
+                err = get_sequence_3(it, end, &cp);
            break;
            case 4:
-                if (is_trail(*(++it))) {
-                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);                
-                    if (is_trail(*(++it))) {
-                        cp += (mask8(*it) << 6) & 0xfff;
-                        if (is_trail(*(++it))) {
-                            cp += (*it) & 0x3f; 
-                        }
-                        else {
-                            std::advance(it, -3);
-                            return INCOMPLETE_SEQUENCE;
-                        }
-                    }
-                    else {
-                        std::advance(it, -2);
-                        return INCOMPLETE_SEQUENCE;
-                    }
-                }
-                else {
-                    --it;
-                    return INCOMPLETE_SEQUENCE;
-                }
+                err = get_sequence_4(it, end, &cp);
            break;
        }
-        // Is the code point valid?
-        if (!is_code_point_valid(cp)) {
-            for (octet_difference_type i = 0; i < length - 1; ++i) 
-                --it;
-            return INVALID_CODE_POINT;
-        }
-            
-        if (code_point)
-            *code_point = cp;
-            
-        if (cp < 0x80) {
-            if (length != 1) {
-                std::advance(it, -(length-1));
-                return OVERLONG_SEQUENCE;
+
+        if (err == UTF8_OK) {
+            // Decoding succeeded. Now, security checks...
+            if (is_code_point_valid(cp)) {
+                if (!is_overlong_sequence(cp, length)){
+                    // Passed! Return here.
+                    if (code_point)
+                        *code_point = cp;
+                    ++it;
+                    return UTF8_OK;
+                }
+                else
+                    err = OVERLONG_SEQUENCE;
            }
+            else 
+                err = INVALID_CODE_POINT;
        }
-        else if (cp < 0x800) {
-            if (length != 2) {
-                std::advance(it, -(length-1));
-                return OVERLONG_SEQUENCE;
-            }
-        }
-        else if (cp < 0x10000) {
-            if (length != 3) {
-                std::advance(it, -(length-1));
-                return OVERLONG_SEQUENCE;
-            }
-        }
-           
-        ++it;
-        return OK;    
+
+        // Failure branch - restore the original value of the iterator
+        it = original_it;
+        return err;
    }

    template <typename octet_iterator>
@ -227,12 +305,12 @@ namespace internal
        return validate_next(it, end, 0);
    }

-} // namespace internal 
+} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
-    const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 
+    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
@ -240,7 +318,7 @@ namespace internal
        octet_iterator result = start;
        while (result != end) {
            internal::utf_error err_code = internal::validate_next(result, end);
-            if (err_code != internal::OK)
+            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
--- a/dep/include/utf8cpp/utf8/unchecked.h
+++ b/dep/include/utf8cpp/utf8/unchecked.h
@ -45,13 +45,13 @@ namespace utf8
            }
            else if (cp < 0x10000) {              // three octets
                *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
-                *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
+                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
            }
            else {                                // four octets
                *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
-                *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f   | 0x80);
-                *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
+                *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
+                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
            }
            return result;
@ -132,7 +132,7 @@ namespace utf8
            while (start != end) {
                uint32_t cp = internal::mask16(*start++);
            // Take care of surrogate pairs first
-                if (internal::is_surrogate(cp)) {
+                if (internal::is_lead_surrogate(cp)) {
                    uint32_t trail_surrogate = internal::mask16(*start++);
                    cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
                }