JSON-94: re-implement combining of UTF-16 surrogate pairs. the 1st implementation was nuts. :-9 JSON-93
author Roker <roker@pep-project.org>
Thu, 07 Jun 2018 12:44:11 +0200
branch JSON-93
changeset 537:ff1cdc290c32
parent 536 b3f80037ebca
child 538 bd0799df842f
JSON-94: re-implement combining of UTF-16 surrogate pairs. the 1st implementation was nuts. :-9
server/json_spirit/json_spirit_reader_template.h
     1.1 --- a/server/json_spirit/json_spirit_reader_template.h	Thu Jun 07 12:42:43 2018 +0200
     1.2 +++ b/server/json_spirit/json_spirit_reader_template.h	Thu Jun 07 12:44:11 2018 +0200
     1.3 @@ -59,7 +59,7 @@
     1.4          if( ( c >= '0' ) && ( c <= '9' ) ) return c - '0';
     1.5          if( ( c >= 'a' ) && ( c <= 'f' ) ) return c - 'a' + 10;
     1.6          if( ( c >= 'A' ) && ( c <= 'F' ) ) return c - 'A' + 10;
     1.7 -        return 0;
     1.8 +        throw std::runtime_error(std::string("Char \"") + char(c) + "\" is not a hex digit!");
     1.9      }
    1.10  
    1.11      template< class Char_type, class Iter_type >
    1.12 @@ -104,7 +104,6 @@
    1.13      {
    1.14          typedef typename String_type::value_type Char_type;
    1.15          
    1.16 -        unsigned high_surrogate = 0;
    1.17          const Char_type c2( *begin );
    1.18  
    1.19          switch( c2 )
    1.20 @@ -136,23 +135,27 @@
    1.21                      }else{
    1.22                          if(c>=0xD800 && c<=0xDBFF) // high surrogate from UTF-16 pair
    1.23                          {
    1.24 -                            high_surrogate = c;
    1.25 -                        }else if(c>=0xDC00 && c<=0xDFFF) // low surrogate from UTF-16 pair
    1.26 +                            const unsigned high_surrogate = c;
    1.27 +                            if(end-begin<7)
    1.28 +                                throw std::runtime_error("Missing low surrogate at end of string. E0");
    1.29 +                            
    1.30 +                            if(*++begin != '\\')
    1.31 +                                throw std::runtime_error("Missing low surrogate at end of string. E1");
    1.32 +                            
    1.33 +                            if(*++begin != 'u')
    1.34 +                                throw std::runtime_error("Missing low surrogate at end of string. E2");
    1.35 +                            
    1.36 +                            const unsigned low_surrogate = unicode_str_to_char< Char_type >( begin );
    1.37 +                            if( (low_surrogate < 0xDC00) || (low_surrogate > 0xDFFF) )
    1.38 +                                throw std::runtime_error("Missing low surrogate at end of string. E3");
    1.39 +                            
    1.40 +                            // combine the two escaped \u sequences into one Non-BMP character:
    1.41 +                            const unsigned u32 = (high_surrogate-0xD800) * 1024 + (low_surrogate-0xDC00) + 0x10000;
    1.42 +                            s += encode_utf<String_type>(u32);
    1.43 +                        }else if(c>=0xDC00 && c<=0xDFFF)
    1.44                          {
    1.45 -                            if(high_surrogate)
    1.46 -                            {
    1.47 -                                // combine the two escaped \u sequences into one Non-BMP character:
    1.48 -                                const unsigned u32 = (high_surrogate-0xD800) * 1024 + (c-0xDC00) + 0x10000;
    1.49 -                                s += encode_utf<String_type>(u32);
    1.50 -                                high_surrogate = 0;
    1.51 -                            }else{
    1.52 -                                throw std::runtime_error("Escaped low surrogate without high surrogate before!");
    1.53 -                            }
    1.54 +                            throw std::runtime_error("Unexpected low surrogate.");
    1.55                          }else{
    1.56 -                            if(high_surrogate)
    1.57 -                            {
    1.58 -                                throw std::runtime_error("Escaped high surrogate without following low surrogate!");
    1.59 -                            }
    1.60                              s += encode_utf<String_type>(c); // normal \u escaped BMP character.
    1.61                          }
    1.62                      }
    1.63 @@ -161,7 +164,7 @@
    1.64              }
    1.65              default :
    1.66              {
    1.67 -                throw std::runtime_error("Unknown char \"" + c2 + "\" after backslash.");
    1.68 +                throw std::runtime_error(std::string("Unknown char \"") + char(c2) + "\" after backslash.");
    1.69              }
    1.70          }
    1.71      }