LCOV - CPython lcov report - Objects/unicodeobject.c

LCOV - code coverage report

Current view:	top level - Objects - unicodeobject.c (source / functions)		Hit	Total	Coverage
Test:	CPython lcov report	Lines:	132	4009	3.3 %
Date:	2017-04-19	Functions:	14	182	7.7 %

          Line data    Source code

       1             : /*
       2             : 
       3             : Unicode implementation based on original code by Fredrik Lundh,
       4             : modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
       5             : Unicode Integration Proposal (see file Misc/unicode.txt).
       6             : 
       7             : Major speed upgrades to the method implementations at the Reykjavik
       8             : NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
       9             : 
      10             : Copyright (c) Corporation for National Research Initiatives.
      11             : 
      12             : --------------------------------------------------------------------
      13             : The original string type implementation is:
      14             : 
      15             :   Copyright (c) 1999 by Secret Labs AB
      16             :   Copyright (c) 1999 by Fredrik Lundh
      17             : 
      18             : By obtaining, using, and/or copying this software and/or its
      19             : associated documentation, you agree that you have read, understood,
      20             : and will comply with the following terms and conditions:
      21             : 
      22             : Permission to use, copy, modify, and distribute this software and its
      23             : associated documentation for any purpose and without fee is hereby
      24             : granted, provided that the above copyright notice appears in all
      25             : copies, and that both that copyright notice and this permission notice
      26             : appear in supporting documentation, and that the name of Secret Labs
      27             : AB or the author not be used in advertising or publicity pertaining to
      28             : distribution of the software without specific, written prior
      29             : permission.
      30             : 
      31             : SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
      32             : THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      33             : FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
      34             : ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      35             : WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      36             : ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
      37             : OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      38             : --------------------------------------------------------------------
      39             : 
      40             : */
      41             : 
      42             : #define PY_SSIZE_T_CLEAN
      43             : #include "Python.h"
      44             : 
      45             : #include "unicodeobject.h"
      46             : #include "ucnhash.h"
      47             : 
      48             : #ifdef MS_WINDOWS
      49             : #include <windows.h>
      50             : #endif
      51             : 
      52             : /* Limit for the Unicode object free list */
      53             : 
      54             : #define PyUnicode_MAXFREELIST       1024
      55             : 
      56             : /* Limit for the Unicode object free list stay alive optimization.
      57             : 
      58             :    The implementation will keep allocated Unicode memory intact for
      59             :    all objects on the free list having a size less than this
      60             :    limit. This reduces malloc() overhead for small Unicode objects.
      61             : 
      62             :    At worst this will result in PyUnicode_MAXFREELIST *
      63             :    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
      64             :    malloc()-overhead) bytes of unused garbage.
      65             : 
      66             :    Setting the limit to 0 effectively turns the feature off.
      67             : 
      68             :    Note: This is an experimental feature ! If you get core dumps when
      69             :    using Unicode objects, turn this feature off.
      70             : 
      71             : */
      72             : 
      73             : #define KEEPALIVE_SIZE_LIMIT       9
      74             : 
      75             : /* Endianness switches; defaults to little endian */
      76             : 
      77             : #ifdef WORDS_BIGENDIAN
      78             : # define BYTEORDER_IS_BIG_ENDIAN
      79             : #else
      80             : # define BYTEORDER_IS_LITTLE_ENDIAN
      81             : #endif
      82             : 
      83             : /* --- Globals ------------------------------------------------------------
      84             : 
      85             : NOTE: In the interpreter's initialization phase, some globals are currently
      86             :       initialized dynamically as needed. In the process Unicode objects may
      87             :       be created before the Unicode type is ready.
      88             : 
      89             : */
      90             : 
      91             : 
      92             : #ifdef __cplusplus
      93             : extern "C" {
      94             : #endif
      95             : 
      96             : /* Free list for Unicode objects */
      97             : static PyUnicodeObject *free_list = NULL;
      98             : static int numfree = 0;
      99             : 
     100             : /* The empty Unicode object is shared to improve performance. */
     101             : static PyUnicodeObject *unicode_empty = NULL;
     102             : 
     103             : #define _Py_RETURN_UNICODE_EMPTY()                      \
     104             :     do {                                                \
     105             :         if (unicode_empty != NULL)                      \
     106             :             Py_INCREF(unicode_empty);                   \
     107             :         else {                                          \
     108             :             unicode_empty = _PyUnicode_New(0);          \
     109             :             if (unicode_empty != NULL)                  \
     110             :                 Py_INCREF(unicode_empty);               \
     111             :         }                                               \
     112             :         return (PyObject *)unicode_empty;               \
     113             :     } while (0)
     114             : 
     115             : /* Single character Unicode strings in the Latin-1 range are being
     116             :    shared as well. */
     117             : static PyUnicodeObject *unicode_latin1[256] = {NULL};
     118             : 
     119             : /* Default encoding to use and assume when NULL is passed as encoding
     120             :    parameter; it is initialized by _PyUnicode_Init().
     121             : 
     122             :    Always use the PyUnicode_SetDefaultEncoding() and
     123             :    PyUnicode_GetDefaultEncoding() APIs to access this global.
     124             : 
     125             : */
     126             : static char unicode_default_encoding[100 + 1] = "ascii";
     127             : 
     128             : /* Fast detection of the most frequent whitespace characters */
     129             : const unsigned char _Py_ascii_whitespace[] = {
     130             :     0, 0, 0, 0, 0, 0, 0, 0,
     131             : /*     case 0x0009: * CHARACTER TABULATION */
     132             : /*     case 0x000A: * LINE FEED */
     133             : /*     case 0x000B: * LINE TABULATION */
     134             : /*     case 0x000C: * FORM FEED */
     135             : /*     case 0x000D: * CARRIAGE RETURN */
     136             :     0, 1, 1, 1, 1, 1, 0, 0,
     137             :     0, 0, 0, 0, 0, 0, 0, 0,
     138             : /*     case 0x001C: * FILE SEPARATOR */
     139             : /*     case 0x001D: * GROUP SEPARATOR */
     140             : /*     case 0x001E: * RECORD SEPARATOR */
     141             : /*     case 0x001F: * UNIT SEPARATOR */
     142             :     0, 0, 0, 0, 1, 1, 1, 1,
     143             : /*     case 0x0020: * SPACE */
     144             :     1, 0, 0, 0, 0, 0, 0, 0,
     145             :     0, 0, 0, 0, 0, 0, 0, 0,
     146             :     0, 0, 0, 0, 0, 0, 0, 0,
     147             :     0, 0, 0, 0, 0, 0, 0, 0,
     148             : 
     149             :     0, 0, 0, 0, 0, 0, 0, 0,
     150             :     0, 0, 0, 0, 0, 0, 0, 0,
     151             :     0, 0, 0, 0, 0, 0, 0, 0,
     152             :     0, 0, 0, 0, 0, 0, 0, 0,
     153             :     0, 0, 0, 0, 0, 0, 0, 0,
     154             :     0, 0, 0, 0, 0, 0, 0, 0,
     155             :     0, 0, 0, 0, 0, 0, 0, 0,
     156             :     0, 0, 0, 0, 0, 0, 0, 0
     157             : };
     158             : 
     159             : /* Same for linebreaks */
     160             : static unsigned char ascii_linebreak[] = {
     161             :     0, 0, 0, 0, 0, 0, 0, 0,
     162             : /*         0x000A, * LINE FEED */
     163             : /*         0x000B, * LINE TABULATION */
     164             : /*         0x000C, * FORM FEED */
     165             : /*         0x000D, * CARRIAGE RETURN */
     166             :     0, 0, 1, 1, 1, 1, 0, 0,
     167             :     0, 0, 0, 0, 0, 0, 0, 0,
     168             : /*         0x001C, * FILE SEPARATOR */
     169             : /*         0x001D, * GROUP SEPARATOR */
     170             : /*         0x001E, * RECORD SEPARATOR */
     171             :     0, 0, 0, 0, 1, 1, 1, 0,
     172             :     0, 0, 0, 0, 0, 0, 0, 0,
     173             :     0, 0, 0, 0, 0, 0, 0, 0,
     174             :     0, 0, 0, 0, 0, 0, 0, 0,
     175             :     0, 0, 0, 0, 0, 0, 0, 0,
     176             : 
     177             :     0, 0, 0, 0, 0, 0, 0, 0,
     178             :     0, 0, 0, 0, 0, 0, 0, 0,
     179             :     0, 0, 0, 0, 0, 0, 0, 0,
     180             :     0, 0, 0, 0, 0, 0, 0, 0,
     181             :     0, 0, 0, 0, 0, 0, 0, 0,
     182             :     0, 0, 0, 0, 0, 0, 0, 0,
     183             :     0, 0, 0, 0, 0, 0, 0, 0,
     184             :     0, 0, 0, 0, 0, 0, 0, 0
     185             : };
     186             : 
     187             : 
     188             : Py_UNICODE
     189           3 : PyUnicode_GetMax(void)
     190             : {
     191             : #ifdef Py_UNICODE_WIDE
     192             :     return 0x10FFFF;
     193             : #else
     194             :     /* This is actually an illegal character, so it should
     195             :        not be passed to unichr. */
     196           3 :     return 0xFFFF;
     197             : #endif
     198             : }
     199             : 
     200             : /* --- Bloom Filters ----------------------------------------------------- */
     201             : 
     202             : /* stuff to implement simple "bloom filters" for Unicode characters.
     203             :    to keep things simple, we use a single bitmask, using the lowest
     204             :    log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
     205             : 
     206             : /* the linebreak mask is set up by Unicode_Init below */
     207             : 
     208             : #if LONG_BIT >= 128
     209             : #define BLOOM_WIDTH 128
     210             : #elif LONG_BIT >= 64
     211             : #define BLOOM_WIDTH 64
     212             : #elif LONG_BIT >= 32
     213             : #define BLOOM_WIDTH 32
     214             : #else
     215             : #error "LONG_BIT is smaller than 32"
     216             : #endif
     217             : 
     218             : #define BLOOM_MASK unsigned long
     219             : 
     220             : static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
     221             : 
     222             : #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
     223             : #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
     224             : 
     225             : #define BLOOM_LINEBREAK(ch)                                             \
     226             :     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     227             :      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
     228             : 
     229           3 : Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
     230             : {
     231             :     /* calculate simple bloom-style bitmask for a given unicode string */
     232             : 
     233             :     BLOOM_MASK mask;
     234             :     Py_ssize_t i;
     235             : 
     236           3 :     mask = 0;
     237          27 :     for (i = 0; i < len; i++)
     238          24 :         BLOOM_ADD(mask, ptr[i]);
     239             : 
     240           3 :     return mask;
     241             : }
     242             : 
     243           0 : Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
     244             : {
     245             :     Py_ssize_t i;
     246             : 
     247           0 :     for (i = 0; i < setlen; i++)
     248           0 :         if (set[i] == chr)
     249           0 :             return 1;
     250             : 
     251           0 :     return 0;
     252             : }
     253             : 
     254             : #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
     255             :     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
     256             : 
     257             : /* --- Unicode Object ----------------------------------------------------- */
     258             : 
     259             : static
     260          42 : int unicode_resize(register PyUnicodeObject *unicode,
     261             :                    Py_ssize_t length)
     262             : {
     263             :     void *oldstr;
     264             : 
     265             :     /* Shortcut if there's nothing much to do. */
     266          42 :     if (unicode->length == length)
     267          42 :         goto reset;
     268             : 
     269             :     /* Resizing shared object (unicode_empty or single character
     270             :        objects) in-place is not allowed. Use PyUnicode_Resize()
     271             :        instead ! */
     272             : 
     273           0 :     if (unicode == unicode_empty ||
     274           0 :         (unicode->length == 1 &&
     275           0 :          unicode->str[0] < 256U &&
     276           0 :          unicode_latin1[unicode->str[0]] == unicode)) {
     277           0 :         PyErr_SetString(PyExc_SystemError,
     278             :                         "can't resize shared unicode objects");
     279           0 :         return -1;
     280             :     }
     281             : 
     282             :     /* We allocate one more element to make sure the string is Ux0000 terminated.
     283             :        The overallocation is also used by fastsearch, which assumes that it's
     284             :        safe to look at str[length] (without making any assumptions about what
     285             :        it contains). */
     286             : 
     287           0 :     oldstr = unicode->str;
     288           0 :     unicode->str = PyObject_REALLOC(unicode->str,
     289             :                                     sizeof(Py_UNICODE) * (length + 1));
     290           0 :     if (!unicode->str) {
     291           0 :         unicode->str = (Py_UNICODE *)oldstr;
     292           0 :         PyErr_NoMemory();
     293           0 :         return -1;
     294             :     }
     295           0 :     unicode->str[length] = 0;
     296           0 :     unicode->length = length;
     297             : 
     298             :   reset:
     299             :     /* Reset the object caches */
     300          42 :     if (unicode->defenc) {
     301           0 :         Py_CLEAR(unicode->defenc);
     302             :     }
     303          42 :     unicode->hash = -1;
     304             : 
     305          42 :     return 0;
     306             : }
     307             : 
     308             : /* We allocate one more element to make sure the string is
     309             :    Ux0000 terminated; some code relies on that.
     310             : 
     311             :    XXX This allocator could further be enhanced by assuring that the
     312             :    free list never reduces its size below 1.
     313             : 
     314             : */
     315             : 
     316             : static
     317          54 : PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
     318             : {
     319             :     register PyUnicodeObject *unicode;
     320             : 
     321             :     /* Optimization for empty strings */
     322          54 :     if (length == 0 && unicode_empty != NULL) {
     323           9 :         Py_INCREF(unicode_empty);
     324           9 :         return unicode_empty;
     325             :     }
     326             : 
     327             :     /* Ensure we won't overflow the size. */
     328          45 :     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
     329           0 :         return (PyUnicodeObject *)PyErr_NoMemory();
     330             :     }
     331             : 
     332             :     /* Unicode freelist & memory allocation */
     333          45 :     if (free_list) {
     334           3 :         unicode = free_list;
     335           3 :         free_list = *(PyUnicodeObject **)unicode;
     336           3 :         numfree--;
     337           3 :         if (unicode->str) {
     338             :             /* Keep-Alive optimization: we only upsize the buffer,
     339             :                never downsize it. */
     340           3 :             if ((unicode->length < length) &&
     341           0 :                 unicode_resize(unicode, length) < 0) {
     342           0 :                 PyObject_DEL(unicode->str);
     343           0 :                 unicode->str = NULL;
     344             :             }
     345             :         }
     346             :         else {
     347           0 :             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
     348           0 :             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
     349             :         }
     350           3 :         (void)PyObject_INIT(unicode, &PyUnicode_Type);
     351             :     }
     352             :     else {
     353             :         size_t new_size;
     354          42 :         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
     355          42 :         if (unicode == NULL)
     356           0 :             return NULL;
     357          42 :         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
     358          42 :         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
     359             :     }
     360             : 
     361          45 :     if (!unicode->str) {
     362           0 :         PyErr_NoMemory();
     363           0 :         goto onError;
     364             :     }
     365             :     /* Initialize the first element to guard against cases where
     366             :      * the caller fails before initializing str -- unicode_resize()
     367             :      * reads str[0], and the Keep-Alive optimization can keep memory
     368             :      * allocated for str alive across a call to unicode_dealloc(unicode).
     369             :      * We don't want unicode_resize to read uninitialized memory in
     370             :      * that case.
     371             :      */
     372          45 :     unicode->str[0] = 0;
     373          45 :     unicode->str[length] = 0;
     374          45 :     unicode->length = length;
     375          45 :     unicode->hash = -1;
     376          45 :     unicode->defenc = NULL;
     377          45 :     return unicode;
     378             : 
     379             :   onError:
     380             :     /* XXX UNREF/NEWREF interface should be more symmetrical */
     381             :     _Py_DEC_REFTOTAL;
     382             :     _Py_ForgetReference((PyObject *)unicode);
     383           0 :     PyObject_Del(unicode);
     384           0 :     return NULL;
     385             : }
     386             : 
     387             : static
     388          42 : void unicode_dealloc(register PyUnicodeObject *unicode)
     389             : {
     390          84 :     if (PyUnicode_CheckExact(unicode) &&
     391          42 :         numfree < PyUnicode_MAXFREELIST) {
     392             :         /* Keep-Alive optimization */
     393          42 :         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
     394           3 :             PyObject_DEL(unicode->str);
     395           3 :             unicode->str = NULL;
     396           3 :             unicode->length = 0;
     397             :         }
     398          42 :         if (unicode->defenc) {
     399           0 :             Py_CLEAR(unicode->defenc);
     400             :         }
     401             :         /* Add to free list */
     402          42 :         *(PyUnicodeObject **)unicode = free_list;
     403          42 :         free_list = unicode;
     404          42 :         numfree++;
     405             :     }
     406             :     else {
     407           0 :         PyObject_DEL(unicode->str);
     408           0 :         Py_XDECREF(unicode->defenc);
     409           0 :         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
     410             :     }
     411          42 : }
     412             : 
     413             : static
     414          42 : int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
     415             : {
     416             :     register PyUnicodeObject *v;
     417             : 
     418             :     /* Argument checks */
     419          42 :     if (unicode == NULL) {
     420           0 :         PyErr_BadInternalCall();
     421           0 :         return -1;
     422             :     }
     423          42 :     v = *unicode;
     424          42 :     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
     425           0 :         PyErr_BadInternalCall();
     426           0 :         return -1;
     427             :     }
     428             : 
     429             :     /* Resizing unicode_empty and single character objects is not
     430             :        possible since these are being shared. We simply return a fresh
     431             :        copy with the same Unicode content. */
     432          42 :     if (v->length != length &&
     433           0 :         (v == unicode_empty || v->length == 1)) {
     434           0 :         PyUnicodeObject *w = _PyUnicode_New(length);
     435           0 :         if (w == NULL)
     436           0 :             return -1;
     437           0 :         Py_UNICODE_COPY(w->str, v->str,
     438             :                         length < v->length ? length : v->length);
     439           0 :         Py_SETREF(*unicode, w);
     440           0 :         return 0;
     441             :     }
     442             : 
     443             :     /* Note that we don't have to modify *unicode for unshared Unicode
     444             :        objects, since we can modify them in-place. */
     445          42 :     return unicode_resize(v, length);
     446             : }
     447             : 
     448           0 : int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
     449             : {
     450           0 :     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
     451             : }
     452             : 
     453           0 : PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
     454             :                                 Py_ssize_t size)
     455             : {
     456             :     PyUnicodeObject *unicode;
     457             : 
     458             :     /* If the Unicode data is known at construction time, we can apply
     459             :        some optimizations which share commonly used objects. */
     460           0 :     if (u != NULL) {
     461             : 
     462             :         /* Optimization for empty strings */
     463           0 :         if (size == 0)
     464           0 :             _Py_RETURN_UNICODE_EMPTY();
     465             : 
     466             :         /* Single character Unicode objects in the Latin-1 range are
     467             :            shared when using this constructor */
     468           0 :         if (size == 1 && *u < 256) {
     469           0 :             unicode = unicode_latin1[*u];
     470           0 :             if (!unicode) {
     471           0 :                 unicode = _PyUnicode_New(1);
     472           0 :                 if (!unicode)
     473           0 :                     return NULL;
     474           0 :                 unicode->str[0] = *u;
     475           0 :                 unicode_latin1[*u] = unicode;
     476             :             }
     477           0 :             Py_INCREF(unicode);
     478           0 :             return (PyObject *)unicode;
     479             :         }
     480             :     }
     481             : 
     482           0 :     unicode = _PyUnicode_New(size);
     483           0 :     if (!unicode)
     484           0 :         return NULL;
     485             : 
     486             :     /* Copy the Unicode data into the new object */
     487           0 :     if (u != NULL)
     488           0 :         Py_UNICODE_COPY(unicode->str, u, size);
     489             : 
     490           0 :     return (PyObject *)unicode;
     491             : }
     492             : 
     493           3 : PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
     494             : {
     495             :     PyUnicodeObject *unicode;
     496             : 
     497           3 :     if (size < 0) {
     498           0 :         PyErr_SetString(PyExc_SystemError,
     499             :                         "Negative size passed to PyUnicode_FromStringAndSize");
     500           0 :         return NULL;
     501             :     }
     502             : 
     503             :     /* If the Unicode data is known at construction time, we can apply
     504             :        some optimizations which share commonly used objects.
     505             :        Also, this means the input must be UTF-8, so fall back to the
     506             :        UTF-8 decoder at the end. */
     507           3 :     if (u != NULL) {
     508             : 
     509             :         /* Optimization for empty strings */
     510           0 :         if (size == 0)
     511           0 :             _Py_RETURN_UNICODE_EMPTY();
     512             : 
     513             :         /* Single characters are shared when using this constructor.
     514             :            Restrict to ASCII, since the input must be UTF-8. */
     515           0 :         if (size == 1 && Py_CHARMASK(*u) < 128) {
     516           0 :             unicode = unicode_latin1[Py_CHARMASK(*u)];
     517           0 :             if (!unicode) {
     518           0 :                 unicode = _PyUnicode_New(1);
     519           0 :                 if (!unicode)
     520           0 :                     return NULL;
     521           0 :                 unicode->str[0] = Py_CHARMASK(*u);
     522           0 :                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
     523             :             }
     524           0 :             Py_INCREF(unicode);
     525           0 :             return (PyObject *)unicode;
     526             :         }
     527             : 
     528           0 :         return PyUnicode_DecodeUTF8(u, size, NULL);
     529             :     }
     530             : 
     531           3 :     unicode = _PyUnicode_New(size);
     532           3 :     if (!unicode)
     533           0 :         return NULL;
     534             : 
     535           3 :     return (PyObject *)unicode;
     536             : }
     537             : 
     538           0 : PyObject *PyUnicode_FromString(const char *u)
     539             : {
     540           0 :     size_t size = strlen(u);
     541           0 :     if (size > PY_SSIZE_T_MAX) {
     542           0 :         PyErr_SetString(PyExc_OverflowError, "input too long");
     543           0 :         return NULL;
     544             :     }
     545             : 
     546           0 :     return PyUnicode_FromStringAndSize(u, size);
     547             : }
     548             : 
     549             : /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
     550             :  * to by 'ptr', possibly combining surrogate pairs on narrow builds.
     551             :  * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
     552             :  * that should be returned and 'end' pointing to the end of the buffer.
     553             :  * ('end' is used on narrow builds to detect a lone surrogate at the
     554             :  * end of the buffer that should be returned unchanged.)
     555             :  * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
     556             :  * The type of the returned char is always Py_UCS4.
     557             :  *
     558             :  * Note: the macro advances ptr to next char, so it might have side-effects
     559             :  *       (especially if used with other macros).
     560             :  */
     561             : 
     562             : /* helper macros used by _Py_UNICODE_NEXT */
     563             : #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
     564             : #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
     565             : /* Join two surrogate characters and return a single Py_UCS4 value. */
     566             : #define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
     567             :     (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
     568             :       ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
     569             : 
     570             : #ifdef Py_UNICODE_WIDE
     571             : #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
     572             : #else
     573             : #define _Py_UNICODE_NEXT(ptr, end)                                      \
     574             :      (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
     575             :         _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
     576             :        ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
     577             :        (Py_UCS4)*(ptr)++)
     578             : #endif
     579             : 
     580             : #ifdef HAVE_WCHAR_H
     581             : 
     582             : #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
     583             : # define CONVERT_WCHAR_TO_SURROGATES
     584             : #endif
     585             : 
     586             : #ifdef CONVERT_WCHAR_TO_SURROGATES
     587             : 
     588             : /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
     589             :    to convert from UTF32 to UTF16. */
     590             : 
     591           0 : PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
     592             :                                  Py_ssize_t size)
     593             : {
     594             :     PyUnicodeObject *unicode;
     595             :     register Py_ssize_t i;
     596             :     Py_ssize_t alloc;
     597             :     const wchar_t *orig_w;
     598             : 
     599           0 :     if (w == NULL) {
     600           0 :         PyErr_BadInternalCall();
     601           0 :         return NULL;
     602             :     }
     603             : 
     604           0 :     alloc = size;
     605           0 :     orig_w = w;
     606           0 :     for (i = size; i > 0; i--) {
     607           0 :         if (*w > 0xFFFF)
     608           0 :             alloc++;
     609           0 :         w++;
     610             :     }
     611           0 :     w = orig_w;
     612           0 :     unicode = _PyUnicode_New(alloc);
     613           0 :     if (!unicode)
     614           0 :         return NULL;
     615             : 
     616             :     /* Copy the wchar_t data into the new object */
     617             :     {
     618             :         register Py_UNICODE *u;
     619           0 :         u = PyUnicode_AS_UNICODE(unicode);
     620           0 :         for (i = size; i > 0; i--) {
     621           0 :             if (*w > 0xFFFF) {
     622           0 :                 wchar_t ordinal = *w++;
     623           0 :                 ordinal -= 0x10000;
     624           0 :                 *u++ = 0xD800 | (ordinal >> 10);
     625           0 :                 *u++ = 0xDC00 | (ordinal & 0x3FF);
     626             :             }
     627             :             else
     628           0 :                 *u++ = *w++;
     629             :         }
     630             :     }
     631           0 :     return (PyObject *)unicode;
     632             : }
     633             : 
     634             : #else
     635             : 
     636             : PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
     637             :                                  Py_ssize_t size)
     638             : {
     639             :     PyUnicodeObject *unicode;
     640             : 
     641             :     if (w == NULL) {
     642             :         PyErr_BadInternalCall();
     643             :         return NULL;
     644             :     }
     645             : 
     646             :     unicode = _PyUnicode_New(size);
     647             :     if (!unicode)
     648             :         return NULL;
     649             : 
     650             :     /* Copy the wchar_t data into the new object */
     651             : #ifdef HAVE_USABLE_WCHAR_T
     652             :     memcpy(unicode->str, w, size * sizeof(wchar_t));
     653             : #else
     654             :     {
     655             :         register Py_UNICODE *u;
     656             :         register Py_ssize_t i;
     657             :         u = PyUnicode_AS_UNICODE(unicode);
     658             :         for (i = size; i > 0; i--)
     659             :             *u++ = *w++;
     660             :     }
     661             : #endif
     662             : 
     663             :     return (PyObject *)unicode;
     664             : }
     665             : 
     666             : #endif /* CONVERT_WCHAR_TO_SURROGATES */
     667             : 
     668             : #undef CONVERT_WCHAR_TO_SURROGATES
     669             : 
     670             : static void
     671           0 : makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
     672             : {
     673           0 :     *fmt++ = '%';
     674           0 :     if (width) {
     675           0 :         if (zeropad)
     676           0 :             *fmt++ = '0';
     677           0 :         fmt += sprintf(fmt, "%d", width);
     678             :     }
     679           0 :     if (precision)
     680           0 :         fmt += sprintf(fmt, ".%d", precision);
     681           0 :     if (longflag)
     682           0 :         *fmt++ = 'l';
     683           0 :     else if (size_tflag) {
     684           0 :         char *f = PY_FORMAT_SIZE_T;
     685           0 :         while (*f)
     686           0 :             *fmt++ = *f++;
     687             :     }
     688           0 :     *fmt++ = c;
     689           0 :     *fmt = '\0';
     690           0 : }
     691             : 
     692             : #define appendstring(string) \
     693             :     do { \
     694             :         for (copy = string;*copy; copy++) { \
     695             :             *s++ = (unsigned char)*copy; \
     696             :         } \
     697             :     } while (0)
     698             : 
     699             : PyObject *
     700           0 : PyUnicode_FromFormatV(const char *format, va_list vargs)
     701             : {
     702             :     va_list count;
     703           0 :     Py_ssize_t callcount = 0;
     704           0 :     PyObject **callresults = NULL;
     705           0 :     PyObject **callresult = NULL;
     706           0 :     Py_ssize_t n = 0;
     707           0 :     int width = 0;
     708           0 :     int precision = 0;
     709             :     int zeropad;
     710             :     const char* f;
     711             :     Py_UNICODE *s;
     712             :     PyObject *string;
     713             :     /* used by sprintf */
     714             :     char buffer[21];
     715             :     /* use abuffer instead of buffer, if we need more space
     716             :      * (which can happen if there's a format specifier with width). */
     717           0 :     char *abuffer = NULL;
     718             :     char *realbuffer;
     719           0 :     Py_ssize_t abuffersize = 0;
     720             :     char fmt[60]; /* should be enough for %0width.precisionld */
     721             :     const char *copy;
     722             : 
     723             : #ifdef VA_LIST_IS_ARRAY
     724           0 :     Py_MEMCPY(count, vargs, sizeof(va_list));
     725             : #else
     726             : #ifdef  __va_copy
     727             :     __va_copy(count, vargs);
     728             : #else
     729             :     count = vargs;
     730             : #endif
     731             : #endif
     732             :      /* step 1: count the number of %S/%R/%s format specifications
     733             :       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
     734             :       * objects once during step 3 and put the result in an array) */
     735           0 :     for (f = format; *f; f++) {
     736           0 :          if (*f == '%') {
     737           0 :              f++;
     738           0 :              while (*f && *f != '%' && !isalpha((unsigned)*f))
     739           0 :                  f++;
     740           0 :              if (!*f)
     741           0 :                  break;
     742           0 :              if (*f == 's' || *f=='S' || *f=='R')
     743           0 :                  ++callcount;
     744             :          }
     745             :     }
     746             :     /* step 2: allocate memory for the results of
     747             :      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
     748           0 :     if (callcount) {
     749           0 :         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
     750           0 :         if (!callresults) {
     751           0 :             PyErr_NoMemory();
     752           0 :             return NULL;
     753             :         }
     754           0 :         callresult = callresults;
     755             :     }
     756             :     /* step 3: figure out how large a buffer we need */
     757           0 :     for (f = format; *f; f++) {
     758           0 :         if (*f == '%') {
     759           0 :             const char* p = f++;
     760           0 :             width = 0;
     761           0 :             while (isdigit((unsigned)*f))
     762           0 :                 width = (width*10) + *f++ - '0';
     763           0 :             precision = 0;
     764           0 :             if (*f == '.') {
     765           0 :                 f++;
     766           0 :                 while (isdigit((unsigned)*f))
     767           0 :                     precision = (precision*10) + *f++ - '0';
     768             :             }
     769             : 
     770             :             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
     771             :              * they don't affect the amount of space we reserve.
     772             :              */
     773           0 :             if ((*f == 'l' || *f == 'z') &&
     774           0 :                 (f[1] == 'd' || f[1] == 'u'))
     775           0 :                 ++f;
     776             : 
     777           0 :             switch (*f) {
     778             :             case 'c':
     779             :             {
     780           0 :                 int ordinal = va_arg(count, int);
     781             : #ifdef Py_UNICODE_WIDE
     782             :                 if (ordinal < 0 || ordinal > 0x10ffff) {
     783             :                     PyErr_SetString(PyExc_OverflowError,
     784             :                                     "%c arg not in range(0x110000) "
     785             :                                     "(wide Python build)");
     786             :                     goto fail;
     787             :                 }
     788             : #else
     789           0 :                 if (ordinal < 0 || ordinal > 0xffff) {
     790           0 :                     PyErr_SetString(PyExc_OverflowError,
     791             :                                     "%c arg not in range(0x10000) "
     792             :                                     "(narrow Python build)");
     793           0 :                     goto fail;
     794             :                 }
     795             : #endif
     796             :                 /* fall through... */
     797             :             }
     798             :             case '%':
     799           0 :                 n++;
     800           0 :                 break;
     801             :             case 'd': case 'u': case 'i': case 'x':
     802           0 :                 (void) va_arg(count, int);
     803           0 :                 if (width < precision)
     804           0 :                     width = precision;
     805             :                 /* 20 bytes is enough to hold a 64-bit
     806             :                    integer.  Decimal takes the most space.
     807             :                    This isn't enough for octal.
     808             :                    If a width is specified we need more
     809             :                    (which we allocate later). */
     810           0 :                 if (width < 20)
     811           0 :                     width = 20;
     812           0 :                 n += width;
     813           0 :                 if (abuffersize < width)
     814           0 :                     abuffersize = width;
     815           0 :                 break;
     816             :             case 's':
     817             :             {
     818             :                 /* UTF-8 */
     819           0 :                 const char *s = va_arg(count, const char*);
     820           0 :                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
     821           0 :                 if (!str)
     822           0 :                     goto fail;
     823           0 :                 n += PyUnicode_GET_SIZE(str);
     824             :                 /* Remember the str and switch to the next slot */
     825           0 :                 *callresult++ = str;
     826           0 :                 break;
     827             :             }
     828             :             case 'U':
     829             :             {
     830           0 :                 PyObject *obj = va_arg(count, PyObject *);
     831             :                 assert(obj && PyUnicode_Check(obj));
     832           0 :                 n += PyUnicode_GET_SIZE(obj);
     833           0 :                 break;
     834             :             }
     835             :             case 'V':
     836             :             {
     837           0 :                 PyObject *obj = va_arg(count, PyObject *);
     838           0 :                 const char *str = va_arg(count, const char *);
     839             :                 assert(obj || str);
     840             :                 assert(!obj || PyUnicode_Check(obj));
     841           0 :                 if (obj)
     842           0 :                     n += PyUnicode_GET_SIZE(obj);
     843             :                 else
     844           0 :                     n += strlen(str);
     845           0 :                 break;
     846             :             }
     847             :             case 'S':
     848             :             {
     849           0 :                 PyObject *obj = va_arg(count, PyObject *);
     850             :                 PyObject *str;
     851             :                 assert(obj);
     852           0 :                 str = PyObject_Str(obj);
     853           0 :                 if (!str)
     854           0 :                     goto fail;
     855           0 :                 n += PyString_GET_SIZE(str);
     856             :                 /* Remember the str and switch to the next slot */
     857           0 :                 *callresult++ = str;
     858           0 :                 break;
     859             :             }
     860             :             case 'R':
     861             :             {
     862           0 :                 PyObject *obj = va_arg(count, PyObject *);
     863             :                 PyObject *repr;
     864             :                 assert(obj);
     865           0 :                 repr = PyObject_Repr(obj);
     866           0 :                 if (!repr)
     867           0 :                     goto fail;
     868           0 :                 n += PyUnicode_GET_SIZE(repr);
     869             :                 /* Remember the repr and switch to the next slot */
     870           0 :                 *callresult++ = repr;
     871           0 :                 break;
     872             :             }
     873             :             case 'p':
     874           0 :                 (void) va_arg(count, int);
     875             :                 /* maximum 64-bit pointer representation:
     876             :                  * 0xffffffffffffffff
     877             :                  * so 19 characters is enough.
     878             :                  * XXX I count 18 -- what's the extra for?
     879             :                  */
     880           0 :                 n += 19;
     881           0 :                 break;
     882             :             default:
     883             :                 /* if we stumble upon an unknown
     884             :                    formatting code, copy the rest of
     885             :                    the format string to the output
     886             :                    string. (we cannot just skip the
     887             :                    code, since there's no way to know
     888             :                    what's in the argument list) */
     889           0 :                 n += strlen(p);
     890           0 :                 goto expand;
     891             :             }
     892             :         } else
     893           0 :             n++;
     894             :     }
     895             :   expand:
     896           0 :     if (abuffersize > 20) {
     897             :         /* add 1 for sprintf's trailing null byte */
     898           0 :         abuffer = PyObject_Malloc(abuffersize + 1);
     899           0 :         if (!abuffer) {
     900           0 :             PyErr_NoMemory();
     901           0 :             goto fail;
     902             :         }
     903           0 :         realbuffer = abuffer;
     904             :     }
     905             :     else
     906           0 :         realbuffer = buffer;
     907             :     /* step 4: fill the buffer */
     908             :     /* Since we've analyzed how much space we need for the worst case,
     909             :        we don't have to resize the string.
     910             :        There can be no errors beyond this point. */
     911           0 :     string = PyUnicode_FromUnicode(NULL, n);
     912           0 :     if (!string)
     913           0 :         goto fail;
     914             : 
     915           0 :     s = PyUnicode_AS_UNICODE(string);
     916           0 :     callresult = callresults;
     917             : 
     918           0 :     for (f = format; *f; f++) {
     919           0 :         if (*f == '%') {
     920           0 :             const char* p = f++;
     921           0 :             int longflag = 0;
     922           0 :             int size_tflag = 0;
     923           0 :             zeropad = (*f == '0');
     924             :             /* parse the width.precision part */
     925           0 :             width = 0;
     926           0 :             while (isdigit((unsigned)*f))
     927           0 :                 width = (width*10) + *f++ - '0';
     928           0 :             precision = 0;
     929           0 :             if (*f == '.') {
     930           0 :                 f++;
     931           0 :                 while (isdigit((unsigned)*f))
     932           0 :                     precision = (precision*10) + *f++ - '0';
     933             :             }
     934             :             /* handle the long flag, but only for %ld and %lu.
     935             :                others can be added when necessary. */
     936           0 :             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
     937           0 :                 longflag = 1;
     938           0 :                 ++f;
     939             :             }
     940             :             /* handle the size_t flag. */
     941           0 :             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
     942           0 :                 size_tflag = 1;
     943           0 :                 ++f;
     944             :             }
     945             : 
     946           0 :             switch (*f) {
     947             :             case 'c':
     948           0 :                 *s++ = va_arg(vargs, int);
     949           0 :                 break;
     950             :             case 'd':
     951           0 :                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
     952           0 :                 if (longflag)
     953           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, long));
     954           0 :                 else if (size_tflag)
     955           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
     956             :                 else
     957           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, int));
     958           0 :                 appendstring(realbuffer);
     959           0 :                 break;
     960             :             case 'u':
     961           0 :                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
     962           0 :                 if (longflag)
     963           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
     964           0 :                 else if (size_tflag)
     965           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
     966             :                 else
     967           0 :                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
     968           0 :                 appendstring(realbuffer);
     969           0 :                 break;
     970             :             case 'i':
     971           0 :                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
     972           0 :                 sprintf(realbuffer, fmt, va_arg(vargs, int));
     973           0 :                 appendstring(realbuffer);
     974           0 :                 break;
     975             :             case 'x':
     976           0 :                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
     977           0 :                 sprintf(realbuffer, fmt, va_arg(vargs, int));
     978           0 :                 appendstring(realbuffer);
     979           0 :                 break;
     980             :             case 's':
     981             :             {
     982             :                 /* unused, since we already have the result */
     983           0 :                 (void) va_arg(vargs, char *);
     984           0 :                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
     985             :                                 PyUnicode_GET_SIZE(*callresult));
     986           0 :                 s += PyUnicode_GET_SIZE(*callresult);
     987             :                 /* We're done with the unicode()/repr() => forget it */
     988           0 :                 Py_DECREF(*callresult);
     989             :                 /* switch to next unicode()/repr() result */
     990           0 :                 ++callresult;
     991           0 :                 break;
     992             :             }
     993             :             case 'U':
     994             :             {
     995           0 :                 PyObject *obj = va_arg(vargs, PyObject *);
     996           0 :                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
     997           0 :                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
     998           0 :                 s += size;
     999           0 :                 break;
    1000             :             }
    1001             :             case 'V':
    1002             :             {
    1003           0 :                 PyObject *obj = va_arg(vargs, PyObject *);
    1004           0 :                 const char *str = va_arg(vargs, const char *);
    1005           0 :                 if (obj) {
    1006           0 :                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
    1007           0 :                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
    1008           0 :                     s += size;
    1009             :                 } else {
    1010           0 :                     appendstring(str);
    1011             :                 }
    1012           0 :                 break;
    1013             :             }
    1014             :             case 'S':
    1015             :             case 'R':
    1016             :             {
    1017           0 :                 const char *str = PyString_AS_STRING(*callresult);
    1018             :                 /* unused, since we already have the result */
    1019           0 :                 (void) va_arg(vargs, PyObject *);
    1020           0 :                 appendstring(str);
    1021             :                 /* We're done with the unicode()/repr() => forget it */
    1022           0 :                 Py_DECREF(*callresult);
    1023             :                 /* switch to next unicode()/repr() result */
    1024           0 :                 ++callresult;
    1025           0 :                 break;
    1026             :             }
    1027             :             case 'p':
    1028           0 :                 sprintf(buffer, "%p", va_arg(vargs, void*));
    1029             :                 /* %p is ill-defined:  ensure leading 0x. */
    1030           0 :                 if (buffer[1] == 'X')
    1031           0 :                     buffer[1] = 'x';
    1032           0 :                 else if (buffer[1] != 'x') {
    1033           0 :                     memmove(buffer+2, buffer, strlen(buffer)+1);
    1034           0 :                     buffer[0] = '0';
    1035           0 :                     buffer[1] = 'x';
    1036             :                 }
    1037           0 :                 appendstring(buffer);
    1038           0 :                 break;
    1039             :             case '%':
    1040           0 :                 *s++ = '%';
    1041           0 :                 break;
    1042             :             default:
    1043           0 :                 appendstring(p);
    1044           0 :                 goto end;
    1045             :             }
    1046             :         } else
    1047           0 :             *s++ = *f;
    1048             :     }
    1049             : 
    1050             :   end:
    1051           0 :     if (callresults)
    1052           0 :         PyObject_Free(callresults);
    1053           0 :     if (abuffer)
    1054           0 :         PyObject_Free(abuffer);
    1055           0 :     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
    1056           0 :     return string;
    1057             :   fail:
    1058           0 :     if (callresults) {
    1059           0 :         PyObject **callresult2 = callresults;
    1060           0 :         while (callresult2 < callresult) {
    1061           0 :             Py_DECREF(*callresult2);
    1062           0 :             ++callresult2;
    1063             :         }
    1064           0 :         PyObject_Free(callresults);
    1065             :     }
    1066           0 :     if (abuffer)
    1067           0 :         PyObject_Free(abuffer);
    1068           0 :     return NULL;
    1069             : }
    1070             : 
    1071             : #undef appendstring
    1072             : 
    1073             : PyObject *
    1074           0 : PyUnicode_FromFormat(const char *format, ...)
    1075             : {
    1076             :     PyObject* ret;
    1077             :     va_list vargs;
    1078             : 
    1079             : #ifdef HAVE_STDARG_PROTOTYPES
    1080           0 :     va_start(vargs, format);
    1081             : #else
    1082             :     va_start(vargs);
    1083             : #endif
    1084           0 :     ret = PyUnicode_FromFormatV(format, vargs);
    1085           0 :     va_end(vargs);
    1086           0 :     return ret;
    1087             : }
    1088             : 
    1089           0 : Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
    1090             :                                 wchar_t *w,
    1091             :                                 Py_ssize_t size)
    1092             : {
    1093           0 :     if (unicode == NULL) {
    1094           0 :         PyErr_BadInternalCall();
    1095           0 :         return -1;
    1096             :     }
    1097             : 
    1098             :     /* If possible, try to copy the 0-termination as well */
    1099           0 :     if (size > PyUnicode_GET_SIZE(unicode))
    1100           0 :         size = PyUnicode_GET_SIZE(unicode) + 1;
    1101             : 
    1102             : #ifdef HAVE_USABLE_WCHAR_T
    1103             :     memcpy(w, unicode->str, size * sizeof(wchar_t));
    1104             : #else
    1105             :     {
    1106             :         register Py_UNICODE *u;
    1107             :         register Py_ssize_t i;
    1108           0 :         u = PyUnicode_AS_UNICODE(unicode);
    1109           0 :         for (i = size; i > 0; i--)
    1110           0 :             *w++ = *u++;
    1111             :     }
    1112             : #endif
    1113             : 
    1114           0 :     if (size > PyUnicode_GET_SIZE(unicode))
    1115           0 :         return PyUnicode_GET_SIZE(unicode);
    1116             :     else
    1117           0 :         return size;
    1118             : }
    1119             : 
    1120             : #endif
    1121             : 
    1122           0 : PyObject *PyUnicode_FromOrdinal(int ordinal)
    1123             : {
    1124             :     Py_UNICODE s[1];
    1125             : 
    1126             : #ifdef Py_UNICODE_WIDE
    1127             :     if (ordinal < 0 || ordinal > 0x10ffff) {
    1128             :         PyErr_SetString(PyExc_ValueError,
    1129             :                         "unichr() arg not in range(0x110000) "
    1130             :                         "(wide Python build)");
    1131             :         return NULL;
    1132             :     }
    1133             : #else
    1134           0 :     if (ordinal < 0 || ordinal > 0xffff) {
    1135           0 :         PyErr_SetString(PyExc_ValueError,
    1136             :                         "unichr() arg not in range(0x10000) "
    1137             :                         "(narrow Python build)");
    1138           0 :         return NULL;
    1139             :     }
    1140             : #endif
    1141             : 
    1142           0 :     s[0] = (Py_UNICODE)ordinal;
    1143           0 :     return PyUnicode_FromUnicode(s, 1);
    1144             : }
    1145             : 
    1146           0 : PyObject *PyUnicode_FromObject(register PyObject *obj)
    1147             : {
    1148             :     /* XXX Perhaps we should make this API an alias of
    1149             :        PyObject_Unicode() instead ?! */
    1150           0 :     if (PyUnicode_CheckExact(obj)) {
    1151           0 :         Py_INCREF(obj);
    1152           0 :         return obj;
    1153             :     }
    1154           0 :     if (PyUnicode_Check(obj)) {
    1155             :         /* For a Unicode subtype that's not a Unicode object,
    1156             :            return a true Unicode object with the same data. */
    1157           0 :         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
    1158             :                                      PyUnicode_GET_SIZE(obj));
    1159             :     }
    1160           0 :     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
    1161             : }
    1162             : /* Decode an encoded object (string or character buffer) to unicode.
                     :
                     :    obj      - object to decode; NULL is reported through
                     :               PyErr_BadInternalCall().
                     :    encoding - handed unchanged to PyUnicode_Decode().
                     :    errors   - handed unchanged to PyUnicode_Decode().
                     :
                     :    Returns the result of PyUnicode_Decode(), or NULL on failure.
                     :    Unicode and bytearray arguments are rejected with TypeError.  A
                     :    zero-length input is answered by _Py_RETURN_UNICODE_EMPTY()
                     :    (macro defined outside this view) instead of calling the decoder. */
    1163           3 : PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
    1164             :                                       const char *encoding,
    1165             :                                       const char *errors)
    1166             : {
    1167           3 :     const char *s = NULL;
    1168             :     Py_ssize_t len;
    1169             :     PyObject *v;
    1170             : 
    1171           3 :     if (obj == NULL) {
    1172           0 :         PyErr_BadInternalCall();
    1173           0 :         return NULL;
    1174             :     }
    1175             : 
    1176             : #if 0
    1177             :     /* For b/w compatibility we also accept Unicode objects provided
    1178             :        that no encoding is given and then redirect to
    1179             :        PyObject_Unicode() which then applies the additional logic for
    1180             :        Unicode subclasses.
    1181             : 
    1182             :        NOTE: This API should really only be used for objects which
    1183             :        represent *encoded* Unicode !
    1184             : 
    1185             :     */
    1186             :     if (PyUnicode_Check(obj)) {
    1187             :         if (encoding) {
    1188             :             PyErr_SetString(PyExc_TypeError,
    1189             :                             "decoding Unicode is not supported");
    1190             :             return NULL;
    1191             :         }
    1192             :         return PyObject_Unicode(obj);
    1193             :     }
    1194             : #else
    1195           3 :     if (PyUnicode_Check(obj)) {
    1196           0 :         PyErr_SetString(PyExc_TypeError,
    1197             :                         "decoding Unicode is not supported");
    1198           0 :         return NULL;
    1199             :     }
    1200             : #endif
    1201             : 
    1202             :     /* Coerce object: obtain a byte pointer and length, either directly
                     :        from a string object or through the character buffer
                     :        interface of any other object. */
    1203           3 :     if (PyString_Check(obj)) {
    1204           3 :         s = PyString_AS_STRING(obj);
    1205           3 :         len = PyString_GET_SIZE(obj);
    1206             :     }
    1207           0 :     else if (PyByteArray_Check(obj)) {
    1208             :         /* Python 2.x specific */
    1209           0 :         PyErr_Format(PyExc_TypeError,
    1210             :                      "decoding bytearray is not supported");
    1211           0 :         return NULL;
    1212             :     }
    1213           0 :     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
    1214             :         /* Overwrite the error message with something more useful in
    1215             :            case of a TypeError. */
    1216           0 :         if (PyErr_ExceptionMatches(PyExc_TypeError))
    1217           0 :             PyErr_Format(PyExc_TypeError,
    1218             :                          "coercing to Unicode: need string or buffer, "
    1219             :                          "%.80s found",
    1220           0 :                          Py_TYPE(obj)->tp_name);
    1221           0 :         goto onError;
    1222             :     }
    1223             : 
    1224             :     /* Convert to Unicode */
    1225           3 :     if (len == 0)
    1226           3 :         _Py_RETURN_UNICODE_EMPTY();
    1227             : 
    1228           0 :     v = PyUnicode_Decode(s, len, encoding, errors);
    1229           0 :     return v;
    1230             : 
    1231             :   onError:
    1232           0 :     return NULL;
    1233             : }
    1234             : /* Decode size bytes starting at s using the named encoding.
                     :
                     :    encoding NULL selects PyUnicode_GetDefaultEncoding().  The names
                     :    "utf-8", "latin-1" and "ascii" (plus "mbcs" when MS_WINDOWS and
                     :    HAVE_USABLE_WCHAR_T are defined) are compared with strcmp(), so
                     :    only that exact spelling takes the direct built-in decoder; any
                     :    other name goes through the codec registry with a temporary
                     :    buffer object created from s.
                     :
                     :    Returns the decoded object, or NULL on failure.  A registry
                     :    decoder that returns something other than a unicode object is
                     :    turned into a TypeError.  The temporary buffer object is
                     :    released on both the success and the error path. */
    1235           0 : PyObject *PyUnicode_Decode(const char *s,
    1236             :                            Py_ssize_t size,
    1237             :                            const char *encoding,
    1238             :                            const char *errors)
    1239             : {
    1240           0 :     PyObject *buffer = NULL, *unicode;
    1241             : 
    1242           0 :     if (encoding == NULL)
    1243           0 :         encoding = PyUnicode_GetDefaultEncoding();
    1244             : 
    1245             :     /* Shortcuts for common default encodings */
    1246           0 :     if (strcmp(encoding, "utf-8") == 0)
    1247           0 :         return PyUnicode_DecodeUTF8(s, size, errors);
    1248           0 :     else if (strcmp(encoding, "latin-1") == 0)
    1249           0 :         return PyUnicode_DecodeLatin1(s, size, errors);
    1250             : #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    1251             :     else if (strcmp(encoding, "mbcs") == 0)
    1252             :         return PyUnicode_DecodeMBCS(s, size, errors);
    1253             : #endif
    1254           0 :     else if (strcmp(encoding, "ascii") == 0)
    1255           0 :         return PyUnicode_DecodeASCII(s, size, errors);
    1256             : 
    1257             :     /* Decode via the codec registry */
    1258           0 :     buffer = PyBuffer_FromMemory((void *)s, size);
    1259           0 :     if (buffer == NULL)
    1260           0 :         goto onError;
    1261           0 :     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
    1262           0 :     if (unicode == NULL)
    1263           0 :         goto onError;
    1264           0 :     if (!PyUnicode_Check(unicode)) {
    1265           0 :         PyErr_Format(PyExc_TypeError,
    1266             :                      "decoder did not return an unicode object (type=%.400s)",
    1267           0 :                      Py_TYPE(unicode)->tp_name);
    1268           0 :         Py_DECREF(unicode);
    1269           0 :         goto onError;
    1270             :     }
    1271           0 :     Py_DECREF(buffer);
    1272           0 :     return unicode;
    1273             : 
    1274             :   onError:
    1275           0 :     Py_XDECREF(buffer);
    1276           0 :     return NULL;
    1277             : }
    1278             : /* Run a unicode object through a decoder from the codec registry.
                     :
                     :    A non-unicode argument is reported with PyErr_BadArgument().  A
                     :    py3k warning is issued before decoding; the call fails if
                     :    PyErr_WarnPy3k() returns a negative value.  encoding NULL selects
                     :    PyUnicode_GetDefaultEncoding().
                     :
                     :    Returns whatever _PyCodec_DecodeText() produced (its type is not
                     :    checked here), or NULL on failure. */
    1279           0 : PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
    1280             :                                     const char *encoding,
    1281             :                                     const char *errors)
    1282             : {
    1283             :     PyObject *v;
    1284             : 
    1285           0 :     if (!PyUnicode_Check(unicode)) {
    1286           0 :         PyErr_BadArgument();
    1287           0 :         goto onError;
    1288             :     }
    1289             : 
    1290           0 :     if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
    1291           0 :         goto onError;
    1292             : 
    1293           0 :     if (encoding == NULL)
    1294           0 :         encoding = PyUnicode_GetDefaultEncoding();
    1295             : 
    1296             :     /* Decode via the codec registry */
    1297           0 :     v = _PyCodec_DecodeText(unicode, encoding, errors);
    1298           0 :     if (v == NULL)
    1299           0 :         goto onError;
    1300           0 :     return v;
    1301             : 
    1302             :   onError:
    1303           0 :     return NULL;
    1304             : }
    1305             : /* Encode a raw Py_UNICODE buffer of the given size.
                     :
                     :    A temporary unicode object is built from s and size with
                     :    PyUnicode_FromUnicode(), encoded by PyUnicode_AsEncodedString()
                     :    and then released.  Returns the value produced by
                     :    PyUnicode_AsEncodedString(), or NULL if the temporary object
                     :    could not be created. */
    1306           0 : PyObject *PyUnicode_Encode(const Py_UNICODE *s,
    1307             :                            Py_ssize_t size,
    1308             :                            const char *encoding,
    1309             :                            const char *errors)
    1310             : {
    1311             :     PyObject *v, *unicode;
    1312             : 
    1313           0 :     unicode = PyUnicode_FromUnicode(s, size);
    1314           0 :     if (unicode == NULL)
    1315           0 :         return NULL;
    1316           0 :     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    1317           0 :     Py_DECREF(unicode);
    1318           0 :     return v;
    1319             : }
    1320             : /* Encode a unicode object through the codec registry.
                     :
                     :    A non-unicode argument is reported with PyErr_BadArgument().
                     :    encoding NULL selects PyUnicode_GetDefaultEncoding().
                     :
                     :    Returns whatever _PyCodec_EncodeText() produced (unlike
                     :    PyUnicode_AsEncodedString(), the result type is not checked
                     :    here), or NULL on failure. */
    1321           0 : PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
    1322             :                                     const char *encoding,
    1323             :                                     const char *errors)
    1324             : {
    1325             :     PyObject *v;
    1326             : 
    1327           0 :     if (!PyUnicode_Check(unicode)) {
    1328           0 :         PyErr_BadArgument();
    1329           0 :         goto onError;
    1330             :     }
    1331             : 
    1332           0 :     if (encoding == NULL)
    1333           0 :         encoding = PyUnicode_GetDefaultEncoding();
    1334             : 
    1335             :     /* Encode via the codec registry */
    1336           0 :     v = _PyCodec_EncodeText(unicode, encoding, errors);
    1337           0 :     if (v == NULL)
    1338           0 :         goto onError;
    1339           0 :     return v;
    1340             : 
    1341             :   onError:
    1342           0 :     return NULL;
    1343             : }
    1344             : /* Encode a unicode object and insist on a string object as result.
                     :
                     :    A non-unicode argument is reported with PyErr_BadArgument().
                     :    encoding NULL selects PyUnicode_GetDefaultEncoding().
                     :
                     :    The direct encoders for "utf-8", "latin-1", "ascii" (and "mbcs"
                     :    when MS_WINDOWS and HAVE_USABLE_WCHAR_T are defined) are used
                     :    only when errors is NULL and the name matches exactly under
                     :    strcmp(); every other combination goes through the codec
                     :    registry.
                     :
                     :    Returns the encoded string, or NULL on failure.  A registry
                     :    encoder that returns a non-string object is turned into a
                     :    TypeError and its result is released. */
    1345           0 : PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
    1346             :                                     const char *encoding,
    1347             :                                     const char *errors)
    1348             : {
    1349             :     PyObject *v;
    1350             : 
    1351           0 :     if (!PyUnicode_Check(unicode)) {
    1352           0 :         PyErr_BadArgument();
    1353           0 :         goto onError;
    1354             :     }
    1355             : 
    1356           0 :     if (encoding == NULL)
    1357           0 :         encoding = PyUnicode_GetDefaultEncoding();
    1358             : 
    1359             :     /* Shortcuts for common default encodings */
    1360           0 :     if (errors == NULL) {
    1361           0 :         if (strcmp(encoding, "utf-8") == 0)
    1362           0 :             return PyUnicode_AsUTF8String(unicode);
    1363           0 :         else if (strcmp(encoding, "latin-1") == 0)
    1364           0 :             return PyUnicode_AsLatin1String(unicode);
    1365             : #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    1366             :         else if (strcmp(encoding, "mbcs") == 0)
    1367             :             return PyUnicode_AsMBCSString(unicode);
    1368             : #endif
    1369           0 :         else if (strcmp(encoding, "ascii") == 0)
    1370           0 :             return PyUnicode_AsASCIIString(unicode);
    1371             :     }
    1372             : 
    1373             :     /* Encode via the codec registry */
    1374           0 :     v = _PyCodec_EncodeText(unicode, encoding, errors);
    1375           0 :     if (v == NULL)
    1376           0 :         goto onError;
    1377           0 :     if (!PyString_Check(v)) {
    1378           0 :         PyErr_Format(PyExc_TypeError,
    1379             :                      "encoder did not return a string object (type=%.400s)",
    1380           0 :                      Py_TYPE(v)->tp_name);
    1381           0 :         Py_DECREF(v);
    1382           0 :         goto onError;
    1383             :     }
    1384           0 :     return v;
    1385             : 
    1386             :   onError:
    1387           0 :     return NULL;
    1388             : }
    1389             : /* Return the default-encoded form of a unicode object, using the
                     :    defenc slot of the object as a cache.
                     :
                     :    If defenc is already set it is returned immediately, whatever
                     :    errors is.  Otherwise the object is encoded with encoding NULL
                     :    (the default encoding) and the result is stored in defenc only
                     :    when errors is NULL.
                     :
                     :    unicode is cast to PyUnicodeObject without a type check.  No
                     :    Py_INCREF is performed here, so a cached result is returned as
                     :    the very reference held in defenc.
                     :    NOTE(review): this looks like a borrowed reference for the cached
                     :    case but a fresh, uncached one when errors is not NULL -- confirm
                     :    the ownership convention against the callers. */
    1390           0 : PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
    1391             :                                             const char *errors)
    1392             : {
    1393           0 :     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
    1394             : 
    1395           0 :     if (v)
    1396           0 :         return v;
    1397           0 :     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
    1398           0 :     if (v && errors == NULL)
    1399           0 :         ((PyUnicodeObject *)unicode)->defenc = v;
    1400           0 :     return v;
    1401             : }
    1402             : /* Checked accessor for the Py_UNICODE data of a unicode object.
                     :    Returns PyUnicode_AS_UNICODE(unicode), or NULL after
                     :    PyErr_BadArgument() when the argument is not a unicode object. */
    1403           0 : Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
    1404             : {
    1405           0 :     if (!PyUnicode_Check(unicode)) {
    1406           0 :         PyErr_BadArgument();
    1407           0 :         goto onError;
    1408             :     }
    1409           0 :     return PyUnicode_AS_UNICODE(unicode);
    1410             : 
    1411             :   onError:
    1412           0 :     return NULL;
    1413             : }
    1414             : /* Checked accessor for the size of a unicode object.
                     :    Returns PyUnicode_GET_SIZE(unicode), or -1 after
                     :    PyErr_BadArgument() when the argument is not a unicode object. */
    1415           0 : Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
    1416             : {
    1417           0 :     if (!PyUnicode_Check(unicode)) {
    1418           0 :         PyErr_BadArgument();
    1419           0 :         goto onError;
    1420             :     }
    1421           0 :     return PyUnicode_GET_SIZE(unicode);
    1422             : 
    1423             :   onError:
    1424           0 :     return -1;
    1425             : }
    1426             : /* Return the current default encoding name.  The pointer refers to
                     :    unicode_default_encoding, the same storage that
                     :    PyUnicode_SetDefaultEncoding() overwrites, so the text seen
                     :    through it changes when the default is changed. */
    1427           0 : const char *PyUnicode_GetDefaultEncoding(void)
    1428             : {
    1429           0 :     return unicode_default_encoding;
    1430             : }
    1431             : /* Replace the default encoding name kept in unicode_default_encoding.
                     :
                     :    Returns 0 on success, or -1 when _PyCodec_Lookup() fails for the
                     :    given name (the stored name is left untouched in that case).
                     :
                     :    At most sizeof(unicode_default_encoding) - 1 bytes are copied, so
                     :    a longer name is stored truncated without any error.
                     :    NOTE(review): strncpy() writes no terminator when the source is
                     :    that long or longer; this relies on the last byte of
                     :    unicode_default_encoding (defined outside this view) already
                     :    being NUL -- confirm. */
    1432           0 : int PyUnicode_SetDefaultEncoding(const char *encoding)
    1433             : {
    1434             :     PyObject *v;
    1435             : 
    1436             :     /* Make sure the encoding is valid. As a side effect, this also
    1437             :        loads the encoding into the codec registry cache. */
    1438           0 :     v = _PyCodec_Lookup(encoding);
    1439           0 :     if (v == NULL)
    1440           0 :         goto onError;
    1441           0 :     Py_DECREF(v);
    1442           0 :     strncpy(unicode_default_encoding,
    1443             :             encoding,
    1444             :             sizeof(unicode_default_encoding) - 1);
    1445           0 :     return 0;
    1446             : 
    1447             :   onError:
    1448           0 :     return -1;
    1449             : }
    1450             : 
    1451             : /* error handling callback helper:
    1452             :    build arguments, call the callback and check the arguments,
    1453             :    if no exception occurred, copy the replacement to the output
    1454             :    and adjust various state variables.
    1455             :    return 0 on success, -1 on error
                     :
                     :    errors, errorHandler - the handler is looked up by name with
                     :        PyCodec_LookupError(errors) the first time and kept in
                     :        *errorHandler for later calls.
                     :    encoding, reason, input, insize, startinpos, endinpos - used to
                     :        create the UnicodeDecodeError in *exceptionObject on first
                     :        use; on later calls only its start, end and reason are
                     :        updated.
                     :    inptr, endinpos - on success set to the position returned by the
                     :        handler (*inptr = input + newpos, *endinpos = newpos).
                     :    output, outpos, outptr - the replacement text is copied to
                     :        *outptr; *outptr and *outpos advance by its length.  *output
                     :        may be resized, in which case *outptr is recomputed from the
                     :        new buffer and *outpos.
                     :
                     :    The handler must return a (unicode, int) tuple.  A negative
                     :    position is taken relative to the end of the input; a position
                     :    outside 0..insize raises IndexError.  A size computation that
                     :    would exceed PY_SSIZE_T_MAX raises OverflowError.  The result
                     :    tuple is released on every exit path.
    1456             : */
    1457             : 
    1458             : static
    1459           0 : int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
    1460             :                                      const char *encoding, const char *reason,
    1461             :                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
    1462             :                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    1463             :                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
    1464             : {
    1465             :     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
    1466             : 
    1467           0 :     PyObject *restuple = NULL;
    1468           0 :     PyObject *repunicode = NULL;
    1469           0 :     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
    1470             :     Py_ssize_t requiredsize;
    1471             :     Py_ssize_t newpos;
    1472             :     Py_UNICODE *repptr;
    1473             :     Py_ssize_t repsize;
    1474           0 :     int res = -1;
    1475             : 
    1476           0 :     if (*errorHandler == NULL) {
    1477           0 :         *errorHandler = PyCodec_LookupError(errors);
    1478           0 :         if (*errorHandler == NULL)
    1479           0 :             goto onError;
    1480             :     }
    1481             : 
    1482           0 :     if (*exceptionObject == NULL) {
    1483           0 :         *exceptionObject = PyUnicodeDecodeError_Create(
    1484             :             encoding, input, insize, *startinpos, *endinpos, reason);
    1485           0 :         if (*exceptionObject == NULL)
    1486           0 :             goto onError;
    1487             :     }
    1488             :     else {
    1489           0 :         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
    1490           0 :             goto onError;
    1491           0 :         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
    1492           0 :             goto onError;
    1493           0 :         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
    1494           0 :             goto onError;
    1495             :     }
    1496             : 
    1497           0 :     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    1498           0 :     if (restuple == NULL)
    1499           0 :         goto onError;
    1500           0 :     if (!PyTuple_Check(restuple)) {
    1501           0 :         PyErr_SetString(PyExc_TypeError, &argparse[4]); /* skip the 4-character "O!n;" prefix so only the message text is used */
    1502           0 :         goto onError;
    1503             :     }
    1504           0 :     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
    1505           0 :         goto onError;
    1506           0 :     if (newpos<0) /* negative position: count back from the end of the input */
    1507           0 :         newpos = insize+newpos;
    1508           0 :     if (newpos<0 || newpos>insize) {
    1509           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
    1510           0 :         goto onError;
    1511             :     }
    1512             : 
    1513             :     /* need more space? (at least enough for what we
    1514             :        have+the replacement+the rest of the string (starting
    1515             :        at the new input position), so we won't have to check space
    1516             :        when there are no errors in the rest of the string) */
    1517           0 :     repptr = PyUnicode_AS_UNICODE(repunicode);
    1518           0 :     repsize = PyUnicode_GET_SIZE(repunicode);
    1519           0 :     requiredsize = *outpos;
    1520           0 :     if (requiredsize > PY_SSIZE_T_MAX - repsize) /* test before adding so the sum cannot overflow */
    1521           0 :         goto overflow;
    1522           0 :     requiredsize += repsize;
    1523           0 :     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
    1524           0 :         goto overflow;
    1525           0 :     requiredsize += insize - newpos;
    1526           0 :     if (requiredsize > outsize) {
    1527           0 :         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) /* grow to at least double when 2*outsize cannot overflow */
    1528           0 :             requiredsize = 2*outsize;
    1529           0 :         if (_PyUnicode_Resize(output, requiredsize) < 0)
    1530           0 :             goto onError;
    1531           0 :         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; /* recompute the write pointer from the resized object */
    1532             :     }
    1533           0 :     *endinpos = newpos;
    1534           0 :     *inptr = input + newpos;
    1535           0 :     Py_UNICODE_COPY(*outptr, repptr, repsize);
    1536           0 :     *outptr += repsize;
    1537           0 :     *outpos += repsize;
    1538             :     /* we made it! */
    1539           0 :     res = 0;
    1540             : 
    1541             :   onError:
    1542           0 :     Py_XDECREF(restuple);
    1543           0 :     return res;
    1544             : 
    1545             :   overflow:
    1546           0 :     PyErr_SetString(PyExc_OverflowError,
    1547             :                     "decoded result is too long for a Python string");
    1548           0 :     goto onError; /* res is still -1 here; shares the cleanup above */
    1549             : }
    1550             : 
    1551             : /* --- UTF-7 Codec -------------------------------------------------------- */
    1552             : 
    1553             : /* See RFC2152 for details.  We encode conservatively and decode liberally. */
    1554             : 
    1555             : /* Three simple macros defining base-64. */
    1556             : 
    1557             : /* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/'.)
                     :    c appears several times in the expansion, so pass an expression
                     :    without side effects. */
    1558             : 
    1559             : #define IS_BASE64(c) \
    1560             :     (((c) >= 'A' && (c) <= 'Z') ||     \
    1561             :      ((c) >= 'a' && (c) <= 'z') ||     \
    1562             :      ((c) >= '0' && (c) <= '9') ||     \
    1563             :      (c) == '+' || (c) == '/')
    1564             : 
    1565             : /* given that c is a base-64 character, what is its base-64 value?
                     :    A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; every
                     :    other value falls through to 63, so the result is meaningful
                     :    only when IS_BASE64(c) holds.  c is evaluated more than once. */
    1566             : 
    1567             : #define FROM_BASE64(c)                                                  \
    1568             :     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
    1569             :      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
    1570             :      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
    1571             :      (c) == '+' ? 62 : 63)
    1572             : 
    1573             : /* What is the base-64 character of the bottom 6 bits of n?
                     :    n is masked with 0x3f before it indexes the 64-character
                     :    alphabet, so higher bits of n are ignored. */
    1574             : 
    1575             : #define TO_BASE64(n)  \
    1576             :     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
    1577             : 
    1578             : /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
    1579             :  * decoded as itself.  We are permissive on decoding; the only ASCII
    1580             :  * byte not decoding to itself is the + which begins a base64
    1581             :  * string.
                     :  * Only an upper bound is tested, so c is expected to be
                     :  * non-negative; the decoder in this file passes a byte that was
                     :  * first cast to unsigned char.  c is evaluated twice. */
    1582             : 
    1583             : #define DECODE_DIRECT(c)                                \
    1584             :     ((c) <= 127 && (c) != '+')
    1585             : 
    1586             : /* The UTF-7 encoder treats ASCII characters differently according to
    1587             :  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
    1588             :  * the above).  See RFC2152.  This array identifies these different
    1589             :  * sets:
    1590             :  * 0 : "Set D"
    1591             :  *     alphanumeric and '(),-./:?
    1592             :  * 1 : "Set O"
    1593             :  *     !"#$%&*;<=>@[]^_`{|}
    1594             :  * 2 : "whitespace"
    1595             :  *     ht nl cr sp
    1596             :  * 3 : special (must be base64 encoded)
    1597             :  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
                     :  *
                     :  * The table holds one entry per ASCII code 0..127, laid out as
                     :  * eight rows of sixteen, and is indexed directly by character
                     :  * value.  ENCODE_DIRECT() tests the range of its argument before
                     :  * indexing.
    1598             :  */
    1599             : 
    1600             : static
    1601             : char utf7_category[128] = {
    1602             : /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    1603             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    1604             : /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    1605             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    1606             : /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    1607             :     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    1608             : /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    1609             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    1610             : /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1611             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1612             : /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    1613             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    1614             : /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1615             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1616             : /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    1617             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
    1618             : };
    1619             : 
    1620             : /* ENCODE_DIRECT: this character should be encoded as itself.  The
    1621             :  * answer depends on whether we are encoding set O as itself, and also
    1622             :  * on whether we are encoding whitespace as itself.  RFC2152 makes it
    1623             :  * clear that the answers to these questions vary between
    1624             :  * applications, so this code needs to be flexible.
                     :  *
                     :  * c must be in 1..127 before utf7_category is consulted, so NUL
                     :  * and non-ASCII values are never encoded directly.  Set D
                     :  * characters (category 0) always qualify; whitespace (2) only
                     :  * when directWS is true, Set O (1) only when directO is true.
                     :  * c is evaluated several times, and directO / directWS are used
                     :  * without parentheses in the expansion, so pass simple
                     :  * expressions.  */
    1625             : 
    1626             : #define ENCODE_DIRECT(c, directO, directWS)             \
    1627             :     ((c) < 128 && (c) > 0 &&                            \
    1628             :      ((utf7_category[(c)] == 0) ||                      \
    1629             :       (directWS && (utf7_category[(c)] == 2)) ||        \
    1630             :       (directO && (utf7_category[(c)] == 1))))
    1631             : /* Decode size bytes of UTF-7 at s.  Thin wrapper that forwards to
                     :    PyUnicode_DecodeUTF7Stateful() with consumed set to NULL and
                     :    returns its result unchanged. */
    1632           0 : PyObject *PyUnicode_DecodeUTF7(const char *s,
    1633             :                                Py_ssize_t size,
    1634             :                                const char *errors)
    1635             : {
    1636           0 :     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
    1637             : }
    1638             : 
    1639             : /* The decoder.  The only state we preserve is our read position,
    1640             :  * i.e. how many characters we have consumed.  So if we end in the
    1641             :  * middle of a shift sequence we have to back off the read position
    1642             :  * and the output to the beginning of the sequence, otherwise we lose
    1643             :  * all the shift state (seen bits, number of bits seen, high
    1644             :  * surrogate). */
    1645             : 
    1646           0 : PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
    1647             :                                        Py_ssize_t size,
    1648             :                                        const char *errors,
    1649             :                                        Py_ssize_t *consumed)
    1650             : {
    1651           0 :     const char *starts = s;
    1652             :     Py_ssize_t startinpos;
    1653             :     Py_ssize_t endinpos;
    1654             :     Py_ssize_t outpos;
    1655             :     const char *e;
    1656             :     PyUnicodeObject *unicode;
    1657             :     Py_UNICODE *p;
    1658           0 :     const char *errmsg = "";
    1659           0 :     int inShift = 0;
    1660             :     Py_UNICODE *shiftOutStart;
    1661           0 :     unsigned int base64bits = 0;
    1662           0 :     unsigned long base64buffer = 0;
    1663           0 :     Py_UNICODE surrogate = 0;
    1664           0 :     PyObject *errorHandler = NULL;
    1665           0 :     PyObject *exc = NULL;
    1666             : 
    1667           0 :     unicode = _PyUnicode_New(size);
    1668           0 :     if (!unicode)
    1669           0 :         return NULL;
    1670           0 :     if (size == 0) {
    1671           0 :         if (consumed)
    1672           0 :             *consumed = 0;
    1673           0 :         return (PyObject *)unicode;
    1674             :     }
    1675             : 
    1676           0 :     p = unicode->str;
    1677           0 :     shiftOutStart = p;
    1678           0 :     e = s + size;
    1679             : 
    1680           0 :     while (s < e) {
    1681           0 :         Py_UNICODE ch = (unsigned char) *s;
    1682             : 
    1683           0 :         if (inShift) { /* in a base-64 section */
    1684           0 :             if (IS_BASE64(ch)) { /* consume a base-64 character */
    1685           0 :                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
    1686           0 :                 base64bits += 6;
    1687           0 :                 s++;
    1688           0 :                 if (base64bits >= 16) {
    1689             :                     /* we have enough bits for a UTF-16 value */
    1690           0 :                     Py_UNICODE outCh = (Py_UNICODE)
    1691           0 :                                        (base64buffer >> (base64bits-16));
    1692           0 :                     base64bits -= 16;
    1693           0 :                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
    1694             :                     assert(outCh <= 0xffff);
    1695           0 :                     if (surrogate) {
    1696             :                         /* expecting a second surrogate */
    1697           0 :                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
    1698             : #ifdef Py_UNICODE_WIDE
    1699             :                             *p++ = (((surrogate & 0x3FF)<<10)
    1700             :                                     | (outCh & 0x3FF)) + 0x10000;
    1701             : #else
    1702           0 :                             *p++ = surrogate;
    1703           0 :                             *p++ = outCh;
    1704             : #endif
    1705           0 :                             surrogate = 0;
    1706           0 :                             continue;
    1707             :                         }
    1708             :                         else {
    1709           0 :                             *p++ = surrogate;
    1710           0 :                             surrogate = 0;
    1711             :                         }
    1712             :                     }
    1713           0 :                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
    1714             :                         /* first surrogate */
    1715           0 :                         surrogate = outCh;
    1716             :                     }
    1717             :                     else {
    1718           0 :                         *p++ = outCh;
    1719             :                     }
    1720             :                 }
    1721             :             }
    1722             :             else { /* now leaving a base-64 section */
    1723           0 :                 inShift = 0;
    1724           0 :                 if (base64bits > 0) { /* left-over bits */
    1725           0 :                     if (base64bits >= 6) {
    1726             :                         /* We've seen at least one base-64 character */
    1727           0 :                         s++;
    1728           0 :                         errmsg = "partial character in shift sequence";
    1729           0 :                         goto utf7Error;
    1730             :                     }
    1731             :                     else {
    1732             :                         /* Some bits remain; they should be zero */
    1733           0 :                         if (base64buffer != 0) {
    1734           0 :                             s++;
    1735           0 :                             errmsg = "non-zero padding bits in shift sequence";
    1736           0 :                             goto utf7Error;
    1737             :                         }
    1738             :                     }
    1739             :                 }
    1740           0 :                 if (surrogate && DECODE_DIRECT(ch))
    1741           0 :                     *p++ = surrogate;
    1742           0 :                 surrogate = 0;
    1743           0 :                 if (ch == '-') {
    1744             :                     /* '-' is absorbed; other terminating
    1745             :                        characters are preserved */
    1746           0 :                     s++;
    1747             :                 }
    1748             :             }
    1749             :         }
    1750           0 :         else if ( ch == '+' ) {
    1751           0 :             startinpos = s-starts;
    1752           0 :             s++; /* consume '+' */
    1753           0 :             if (s < e && *s == '-') { /* '+-' encodes '+' */
    1754           0 :                 s++;
    1755           0 :                 *p++ = '+';
    1756             :             }
    1757             :             else { /* begin base64-encoded section */
    1758           0 :                 inShift = 1;
    1759           0 :                 surrogate = 0;
    1760           0 :                 shiftOutStart = p;
    1761           0 :                 base64bits = 0;
    1762           0 :                 base64buffer = 0;
    1763             :             }
    1764             :         }
    1765           0 :         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
    1766           0 :             *p++ = ch;
    1767           0 :             s++;
    1768             :         }
    1769             :         else {
    1770           0 :             startinpos = s-starts;
    1771           0 :             s++;
    1772           0 :             errmsg = "unexpected special character";
    1773           0 :             goto utf7Error;
    1774             :         }
    1775           0 :         continue;
    1776             : utf7Error:
    1777           0 :         outpos = p-PyUnicode_AS_UNICODE(unicode);
    1778           0 :         endinpos = s-starts;
    1779           0 :         if (unicode_decode_call_errorhandler(
    1780             :                 errors, &errorHandler,
    1781             :                 "utf7", errmsg,
    1782             :                 starts, size, &startinpos, &endinpos, &exc, &s,
    1783             :                 &unicode, &outpos, &p))
    1784           0 :             goto onError;
    1785             :     }
    1786             : 
    1787             :     /* end of string */
    1788             : 
    1789           0 :     if (inShift && !consumed) { /* in shift sequence, no more to follow */
    1790             :         /* if we're in an inconsistent state, that's an error */
    1791           0 :         inShift = 0;
    1792           0 :         if (surrogate ||
    1793           0 :                 (base64bits >= 6) ||
    1794           0 :                 (base64bits > 0 && base64buffer != 0)) {
    1795           0 :             outpos = p-PyUnicode_AS_UNICODE(unicode);
    1796           0 :             endinpos = size;
    1797           0 :             if (unicode_decode_call_errorhandler(
    1798             :                     errors, &errorHandler,
    1799             :                     "utf7", "unterminated shift sequence",
    1800             :                     starts, size, &startinpos, &endinpos, &exc, &s,
    1801             :                     &unicode, &outpos, &p))
    1802           0 :                 goto onError;
    1803             :         }
    1804             :     }
    1805             : 
    1806             :     /* return state */
    1807           0 :     if (consumed) {
    1808           0 :         if (inShift) {
    1809           0 :             p = shiftOutStart; /* back off output */
    1810           0 :             *consumed = startinpos;
    1811             :         }
    1812             :         else {
    1813           0 :             *consumed = s-starts;
    1814             :         }
    1815             :     }
    1816             : 
    1817           0 :     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
    1818           0 :         goto onError;
    1819             : 
    1820           0 :     Py_XDECREF(errorHandler);
    1821           0 :     Py_XDECREF(exc);
    1822           0 :     return (PyObject *)unicode;
    1823             : 
    1824             :   onError:
    1825           0 :     Py_XDECREF(errorHandler);
    1826           0 :     Py_XDECREF(exc);
    1827           0 :     Py_DECREF(unicode);
    1828           0 :     return NULL;
    1829             : }
    1830             : /* Encode the size Py_UNICODE units at s as UTF-7 and return them in a new string object. */
    1831             : /* Returns NULL when 8 * size overflows or when the string allocation or the final resize fails. */
    1832           0 : PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
    1833             :                                Py_ssize_t size,
    1834             :                                int base64SetO, /* negated, then handed to ENCODE_DIRECT; NOTE(review): presumably non-zero forces base64 for RFC 2152 Set O characters -- confirm against the macro */
    1835             :                                int base64WhiteSpace, /* negated, then handed to ENCODE_DIRECT; NOTE(review): presumably non-zero forces base64 for whitespace -- confirm against the macro */
    1836             :                                const char *errors) /* not referenced anywhere in this body */
    1837             : {
    1838             :     PyObject *v;
    1839             :     /* It might be possible to tighten this worst case */
    1840           0 :     Py_ssize_t allocated = 8 * size; /* output bytes reserved up front */
    1841           0 :     int inShift = 0; /* non-zero while a base64 run is open in the output */
    1842           0 :     Py_ssize_t i = 0;
    1843           0 :     unsigned int base64bits = 0; /* number of pending bits held in base64buffer */
    1844           0 :     unsigned long base64buffer = 0; /* bit accumulator; the newest input ends up in the low bits */
    1845             :     char * out;
    1846             :     char * start;
    1847             :     /* Dividing back is how a wrapped 8 * size is detected. */
    1848           0 :     if (allocated / 8 != size)
    1849           0 :         return PyErr_NoMemory();
    1850             :     /* Empty input: hand back a zero-length string without reserving the work buffer. */
    1851           0 :     if (size == 0)
    1852           0 :         return PyString_FromStringAndSize(NULL, 0);
    1853             : 
    1854           0 :     v = PyString_FromStringAndSize(NULL, allocated);
    1855           0 :     if (v == NULL)
    1856           0 :         return NULL;
    1857             :     /* start stays fixed so that out - start gives the number of bytes written. */
    1858           0 :     start = out = PyString_AS_STRING(v);
    1859           0 :     for (;i < size; ++i) {
    1860           0 :         Py_UNICODE ch = s[i];
    1861             : 
    1862           0 :         if (inShift) {
    1863           0 :             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    1864             :                 /* shifting out */
    1865           0 :                 if (base64bits) { /* output remaining bits */
    1866           0 :                     *out++ = TO_BASE64(base64buffer << (6-base64bits)); /* leftover bits are shifted left to fill one last 6-bit group */
    1867           0 :                     base64buffer = 0;
    1868           0 :                     base64bits = 0;
    1869             :                 }
    1870           0 :                 inShift = 0;
    1871             :                 /* Characters not in the BASE64 set implicitly unshift the sequence
    1872             :                    so no '-' is required, except if the character is itself a '-' */
    1873           0 :                 if (IS_BASE64(ch) || ch == '-') {
    1874           0 :                     *out++ = '-';
    1875             :                 }
    1876           0 :                 *out++ = (char) ch;
    1877             :             }
    1878             :             else {
    1879             :                 goto encode_char; /* stay inside the open base64 run */
    1880             :             }
    1881             :         }
    1882             :         else { /* not in a shift sequence */
    1883           0 :             if (ch == '+') { /* a literal plus sign is written as the two bytes plus, minus */
    1884           0 :                 *out++ = '+';
    1885           0 :                         *out++ = '-';
    1886             :             }
    1887           0 :             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    1888           0 :                 *out++ = (char) ch;
    1889             :             }
    1890             :             else {
    1891           0 :                 *out++ = '+'; /* open a base64 run */
    1892           0 :                 inShift = 1;
    1893           0 :                 goto encode_char;
    1894             :             }
    1895             :         }
    1896           0 :         continue;
    1897             : encode_char: /* reached only by goto, for characters that are not written directly */
    1898             : #ifdef Py_UNICODE_WIDE
    1899             :         if (ch >= 0x10000) {
    1900             :             /* code first surrogate */
    1901             :             base64bits += 16;
    1902             :             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
    1903             :             while (base64bits >= 6) {
    1904             :                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    1905             :                 base64bits -= 6;
    1906             :             }
    1907             :             /* prepare second surrogate */
    1908             :             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
    1909             :         }
    1910             : #endif
    1911           0 :         base64bits += 16; /* queue one 16-bit unit behind any pending bits */
    1912           0 :         base64buffer = (base64buffer << 16) | ch;
    1913           0 :         while (base64bits >= 6) { /* emit every complete 6-bit group, oldest bits first */
    1914           0 :             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    1915           0 :             base64bits -= 6;
    1916             :         }
    1917             :     }
    1918           0 :     if (base64bits) /* input ended with bits still pending: flush them as one padded group */
    1919           0 :         *out++= TO_BASE64(base64buffer << (6-base64bits) );
    1920           0 :     if (inShift) /* close a base64 run that is still open at end of input */
    1921           0 :         *out++ = '-';
    1922             :     /* Shrink the string to the number of bytes actually written. */
    1923           0 :     if (_PyString_Resize(&v, out - start))
    1924           0 :         return NULL;
    1925           0 :     return v;
    1926             : }
    1927             : 
    1928             : #undef IS_BASE64
    1929             : #undef FROM_BASE64
    1930             : #undef TO_BASE64
    1931             : #undef DECODE_DIRECT
    1932             : #undef ENCODE_DIRECT
    1933             : 
    1934             : /* --- UTF-8 Codec -------------------------------------------------------- */
    1935             : 
    1936             : static
    1937             : char utf8_code_length[256] = {
    1938             :     /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
    1939             :        illegal prefix.  See RFC 3629 for details.  In this table the bytes
                         80-BF, C0-C1 and F5-FF all map to zero. */
    1940             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1941             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1942             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1943             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1944             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1945             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1946             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1947             :     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    1948             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    1949             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    1950             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    1951             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    1952             :     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    1953             :     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    1954             :     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    1955             :     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
    1956             : };
    1957             : /* Decode size bytes of UTF-8 at s by calling PyUnicode_DecodeUTF8Stateful with consumed == NULL. */
    1958          48 : PyObject *PyUnicode_DecodeUTF8(const char *s,
    1959             :                                Py_ssize_t size,
    1960             :                                const char *errors)
    1961             : {
    1962          48 :     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); /* the result of the stateful decoder is returned unchanged */
    1963             : }
    1964             : /* Decode size bytes of UTF-8 at s into a new unicode object; returns NULL if allocation, error handling or the final resize fails. */
    1965          48 : PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
    1966             :                                        Py_ssize_t size,
    1967             :                                        const char *errors, /* forwarded untouched to unicode_decode_call_errorhandler */
    1968             :                                        Py_ssize_t *consumed) /* if non-NULL: receives the number of bytes processed, and a truncated trailing sequence stops decoding instead of raising an error */
    1969             : {
    1970          48 :     const char *starts = s; /* start of input, used to turn pointers into offsets */
    1971             :     int n;
    1972             :     int k;
    1973             :     Py_ssize_t startinpos;
    1974             :     Py_ssize_t endinpos;
    1975             :     Py_ssize_t outpos;
    1976             :     const char *e;
    1977             :     PyUnicodeObject *unicode;
    1978             :     Py_UNICODE *p;
    1979          48 :     const char *errmsg = "";
    1980          48 :     PyObject *errorHandler = NULL;
    1981          48 :     PyObject *exc = NULL;
    1982             : 
    1983             :     /* Note: size is an upper bound on the number of Py_UNICODE units the
    1984             :        branches below write directly (at most one unit per input byte) */
    1985          48 :     unicode = _PyUnicode_New(size);
    1986          48 :     if (!unicode)
    1987           0 :         return NULL;
    1988          48 :     if (size == 0) { /* nothing to decode: return the freshly created object as is */
    1989           6 :         if (consumed)
    1990           0 :             *consumed = 0;
    1991           6 :         return (PyObject *)unicode;
    1992             :     }
    1993             : 
    1994             :     /* Unpack UTF-8 encoded data */
    1995          42 :     p = unicode->str;
    1996          42 :     e = s + size;
    1997             : 
    1998         174 :     while (s < e) {
    1999          90 :         Py_UCS4 ch = (unsigned char)*s;
    2000             :         /* Fast path: a byte below 0x80 is copied through as one code unit. */
    2001          90 :         if (ch < 0x80) {
    2002          90 :             *p++ = (Py_UNICODE)ch;
    2003          90 :             s++;
    2004          90 :             continue;
    2005             :         }
    2006             :         /* Sequence length implied by the lead byte; 0 marks an invalid lead byte. */
    2007           0 :         n = utf8_code_length[ch];
    2008             :         /* The sequence would run past the end of the input. */
    2009           0 :         if (s + n > e) {
    2010           0 :             if (consumed)
    2011           0 :                 break; /* leave the incomplete tail undecoded; *consumed is set after the loop */
    2012             :             else {
    2013           0 :                 errmsg = "unexpected end of data";
    2014           0 :                 startinpos = s-starts;
    2015           0 :                 endinpos = startinpos+1; /* the loop below widens the range over continuation bytes that are present */
    2016           0 :                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
    2017           0 :                     endinpos++;
    2018           0 :                 goto utf8Error;
    2019             :             }
    2020             :         }
    2021             : 
    2022           0 :         switch (n) {
    2023             : 
    2024             :         case 0:
    2025           0 :             errmsg = "invalid start byte";
    2026           0 :             startinpos = s-starts;
    2027           0 :             endinpos = startinpos+1;
    2028           0 :             goto utf8Error;
    2029             : 
    2030             :         case 1: /* the table gives 1 only for bytes below 0x80, which the fast path above already consumed */
    2031           0 :             errmsg = "internal error";
    2032           0 :             startinpos = s-starts;
    2033           0 :             endinpos = startinpos+1;
    2034           0 :             goto utf8Error;
    2035             : 
    2036             :         case 2:
    2037           0 :             if ((s[1] & 0xc0) != 0x80) {
    2038           0 :                 errmsg = "invalid continuation byte";
    2039           0 :                 startinpos = s-starts;
    2040           0 :                 endinpos = startinpos + 1; /* only the lead byte is reported */
    2041           0 :                 goto utf8Error;
    2042             :             }
    2043           0 :             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
    2044             :             assert ((ch > 0x007F) && (ch <= 0x07FF));
    2045           0 :             *p++ = (Py_UNICODE)ch;
    2046           0 :             break;
    2047             : 
    2048             :         case 3:
    2049             :             /* XXX: surrogates shouldn't be valid UTF-8!
    2050             :                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    2051             :                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    2052             :                Uncomment the 2 lines below to make them invalid,
    2053             :                code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
    2054           0 :             if ((s[1] & 0xc0) != 0x80 ||
    2055           0 :                 (s[2] & 0xc0) != 0x80 ||
    2056           0 :                 ((unsigned char)s[0] == 0xE0 &&
    2057           0 :                  (unsigned char)s[1] < 0xA0)/* ||
    2058             :                 ((unsigned char)s[0] == 0xED &&
    2059             :                  (unsigned char)s[1] > 0x9F)*/) {
    2060           0 :                 errmsg = "invalid continuation byte"; /* also used for E0 followed by a byte below A0, which would decode to a value under 0x0800 */
    2061           0 :                 startinpos = s-starts;
    2062           0 :                 endinpos = startinpos + 1;
    2063             : 
    2064             :                 /* if s[1] first two bits are 1 and 0, then the invalid
    2065             :                    continuation byte is s[2], so increment endinpos by 1,
    2066             :                    if not, s[1] is invalid and endinpos doesn't need to
    2067             :                    be incremented. */
    2068           0 :                 if ((s[1] & 0xC0) == 0x80)
    2069           0 :                     endinpos++;
    2070           0 :                 goto utf8Error;
    2071             :             }
    2072           0 :             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
    2073             :             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
    2074           0 :             *p++ = (Py_UNICODE)ch;
    2075           0 :             break;
    2076             : 
    2077             :         case 4:
    2078           0 :             if ((s[1] & 0xc0) != 0x80 ||
    2079           0 :                 (s[2] & 0xc0) != 0x80 ||
    2080           0 :                 (s[3] & 0xc0) != 0x80 ||
    2081           0 :                 ((unsigned char)s[0] == 0xF0 &&
    2082           0 :                  (unsigned char)s[1] < 0x90) || /* F0 with a second byte below 90 would decode to a value under 0x10000 */
    2083           0 :                 ((unsigned char)s[0] == 0xF4 &&
    2084           0 :                  (unsigned char)s[1] > 0x8F)) { /* F4 with a second byte above 8F would decode to a value above 0x10FFFF */
    2085           0 :                 errmsg = "invalid continuation byte";
    2086           0 :                 startinpos = s-starts;
    2087           0 :                 endinpos = startinpos + 1;
    2088           0 :                 if ((s[1] & 0xC0) == 0x80) { /* extend the reported range over each leading continuation byte that is well formed */
    2089           0 :                     endinpos++;
    2090           0 :                     if ((s[2] & 0xC0) == 0x80)
    2091           0 :                         endinpos++;
    2092             :                 }
    2093           0 :                 goto utf8Error;
    2094             :             }
    2095           0 :             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
    2096           0 :                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
    2097             :             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
    2098             : 
    2099             : #ifdef Py_UNICODE_WIDE
    2100             :             *p++ = (Py_UNICODE)ch;
    2101             : #else
    2102             :             /*  compute and append the two surrogates: */
    2103             : 
    2104             :             /*  translate from 10000..10FFFF to 0..FFFF */
    2105           0 :             ch -= 0x10000;
    2106             : 
    2107             :             /*  high surrogate = top 10 bits added to D800 */
    2108           0 :             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
    2109             : 
    2110             :             /*  low surrogate = bottom 10 bits added to DC00 */
    2111           0 :             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
    2112             : #endif
    2113           0 :             break;
    2114             :         }
    2115           0 :         s += n; /* step over the sequence that was just decoded */
    2116           0 :         continue;
    2117             :         /* Error path: s, unicode and p are passed by address; NOTE(review): presumably the handler repositions them -- it is defined outside this view. */
    2118             :       utf8Error:
    2119           0 :         outpos = p-PyUnicode_AS_UNICODE(unicode);
    2120           0 :         if (unicode_decode_call_errorhandler(
    2121             :                 errors, &errorHandler,
    2122             :                 "utf8", errmsg,
    2123             :                 starts, size, &startinpos, &endinpos, &exc, &s,
    2124             :                 &unicode, &outpos, &p))
    2125           0 :             goto onError; /* a non-zero result from the handler aborts decoding */
    2126             :     }
    2127          42 :     if (consumed)
    2128           0 :         *consumed = s-starts; /* number of input bytes processed */
    2129             : 
    2130             :     /* Adjust length */
    2131          42 :     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
    2132           0 :         goto onError;
    2133             : 
    2134          42 :     Py_XDECREF(errorHandler);
    2135          42 :     Py_XDECREF(exc);
    2136          42 :     return (PyObject *)unicode;
    2137             :     /* Failure exit: drop the helper references and the partly built result. */
    2138             :   onError:
    2139           0 :     Py_XDECREF(errorHandler);
    2140           0 :     Py_XDECREF(exc);
    2141           0 :     Py_DECREF(unicode);
    2142           0 :     return NULL;
    2143             : }
    2144             : 
    2145             : /* Allocation strategy:  if the string is short, convert into a stack buffer
    2146             :    and allocate exactly as much space needed at the end.  Else allocate the
    2147             :    maximum possible needed (4 result bytes per Unicode character), and return
    2148             :    the excess memory at the end.  Encodes the size Py_UNICODE units at s as
                         UTF-8; returns a new string object, or NULL on failure.
    2149             : */
    2150             : PyObject *
    2151           0 : PyUnicode_EncodeUTF8(const Py_UNICODE *s,
    2152             :                      Py_ssize_t size,
    2153             :                      const char *errors) /* not referenced anywhere in this body */
    2154             : {
    2155             : #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
    2156             : 
    2157             :     Py_ssize_t i;           /* index into s of the next input character */
    2158             :     PyObject *v;        /* result string object */
    2159             :     char *p;            /* next free byte in output buffer */
    2160             :     Py_ssize_t nallocated;  /* number of result bytes allocated */
    2161             :     Py_ssize_t nneeded;        /* number of result bytes needed */
    2162             :     char stackbuf[MAX_SHORT_UNICHARS * 4]; /* 4 bytes, the longest sequence written below, for each of up to 300 inputs */
    2163             : 
    2164             :     assert(s != NULL);
    2165             :     assert(size >= 0);
    2166             : 
    2167           0 :     if (size <= MAX_SHORT_UNICHARS) {
    2168             :         /* Write into the stack buffer; nallocated can't overflow.
    2169             :          * At the end, we'll allocate exactly as much heap space as it
    2170             :          * turns out we need.
    2171             :          */
    2172           0 :         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
    2173           0 :         v = NULL;   /* will allocate after we're done */
    2174           0 :         p = stackbuf;
    2175             :     }
    2176             :     else {
    2177             :         /* Overallocate on the heap, and give the excess back at the end. */
    2178           0 :         nallocated = size * 4;
    2179           0 :         if (nallocated / 4 != size)  /* overflow! */
    2180           0 :             return PyErr_NoMemory();
    2181           0 :         v = PyString_FromStringAndSize(NULL, nallocated);
    2182           0 :         if (v == NULL)
    2183           0 :             return NULL;
    2184           0 :         p = PyString_AS_STRING(v);
    2185             :     }
    2186             :     /* i is advanced inside the body: once per unit read, plus once more when a surrogate pair is merged. */
    2187           0 :     for (i = 0; i < size;) {
    2188           0 :         Py_UCS4 ch = s[i++];
    2189             : 
    2190           0 :         if (ch < 0x80)
    2191             :             /* Encode ASCII */
    2192           0 :             *p++ = (char) ch;
    2193             : 
    2194           0 :         else if (ch < 0x0800) {
    2195             :             /* Two-byte sequence: U+0080..U+07FF */
    2196           0 :             *p++ = (char)(0xc0 | (ch >> 6));
    2197           0 :             *p++ = (char)(0x80 | (ch & 0x3f));
    2198             :         }
    2199             :         else {
    2200             :             /* Encode UCS2 Unicode ordinals */
    2201           0 :             if (ch < 0x10000) {
    2202             :                 /* Special case: check for high surrogate */
    2203           0 :                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { /* i != size: a following unit exists to pair with */
    2204           0 :                     Py_UCS4 ch2 = s[i];
    2205             :                     /* Check for low surrogate and combine the two to
    2206             :                        form a UCS4 value */
    2207           0 :                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
    2208           0 :                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
    2209           0 :                         i++; /* the low surrogate has been consumed as well */
    2210           0 :                         goto encodeUCS4;
    2211             :                     }
    2212             :                     /* Fall through: handles isolated high surrogates */
    2213             :                 }
    2214           0 :                 *p++ = (char)(0xe0 | (ch >> 12)); /* three-byte sequence; unpaired surrogates are written this way too */
    2215           0 :                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
    2216           0 :                 *p++ = (char)(0x80 | (ch & 0x3f));
    2217           0 :                 continue;
    2218             :             }
    2219             :           encodeUCS4:
    2220             :             /* Encode UCS4 Unicode ordinals */
    2221           0 :             *p++ = (char)(0xf0 | (ch >> 18));
    2222           0 :             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
    2223           0 :             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
    2224           0 :             *p++ = (char)(0x80 | (ch & 0x3f));
    2225             :         }
    2226             :     }
    2227             : 
    2228           0 :     if (v == NULL) {
    2229             :         /* This was stack allocated. */
    2230           0 :         nneeded = p - stackbuf;
    2231             :         assert(nneeded <= nallocated);
    2232           0 :         v = PyString_FromStringAndSize(stackbuf, nneeded); /* v may be NULL here; it is returned to the caller either way */
    2233             :     }
    2234             :     else {
    2235             :         /* Cut back to size actually needed. */
    2236           0 :         nneeded = p - PyString_AS_STRING(v);
    2237             :         assert(nneeded <= nallocated);
    2238           0 :         if (_PyString_Resize(&v, nneeded))
    2239           0 :             return NULL;
    2240             :     }
    2241           0 :     return v;
    2242             : 
    2243             : #undef MAX_SHORT_UNICHARS
    2244             : }
    2245             : /* Encode a unicode object as UTF-8 via PyUnicode_EncodeUTF8; returns NULL after PyErr_BadArgument if the argument fails PyUnicode_Check. */
    2246           0 : PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
    2247             : {
    2248           0 :     if (!PyUnicode_Check(unicode)) {
    2249           0 :         PyErr_BadArgument();
    2250           0 :         return NULL;
    2251             :     }
    2252           0 :     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
    2253             :                                 PyUnicode_GET_SIZE(unicode),
    2254             :                                 NULL); /* NULL is passed for the errors argument */
    2255             : }
    2256             : 
    2257             : /* --- UTF-32 Codec ------------------------------------------------------- */
    2258             : /* Decode size bytes of UTF-32 at s by calling PyUnicode_DecodeUTF32Stateful with consumed == NULL. */
    2259             : PyObject *
    2260           0 : PyUnicode_DecodeUTF32(const char *s,
    2261             :                       Py_ssize_t size,
    2262             :                       const char *errors,
    2263             :                       int *byteorder)
    2264             : {
    2265           0 :     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); /* all four arguments are forwarded unchanged */
    2266             : }
    2267             : 
    2268             : PyObject *
    2269           0 : PyUnicode_DecodeUTF32Stateful(const char *s,
    2270             :                               Py_ssize_t size,
    2271             :                               const char *errors,
    2272             :                               int *byteorder,
    2273             :                               Py_ssize_t *consumed)
    2274             : {
    2275           0 :     const char *starts = s;
    2276             :     Py_ssize_t startinpos;
    2277             :     Py_ssize_t endinpos;
    2278             :     Py_ssize_t outpos;
    2279             :     PyUnicodeObject *unicode;
    2280             :     Py_UNICODE *p;
    2281             : #ifndef Py_UNICODE_WIDE
    2282           0 :     int pairs = 0;
    2283             :     const unsigned char *qq;
    2284             : #else
    2285             :     const int pairs = 0;
    2286             : #endif
    2287             :     const unsigned char *q, *e;
    2288           0 :     int bo = 0;       /* assume native ordering by default */
    2289           0 :     const char *errmsg = "";
    2290             :     /* Offsets from q for retrieving bytes in the right order. */
    2291             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2292           0 :     int iorder[] = {0, 1, 2, 3};
    2293             : #else
    2294             :     int iorder[] = {3, 2, 1, 0};
    2295             : #endif
    2296           0 :     PyObject *errorHandler = NULL;
    2297           0 :     PyObject *exc = NULL;
    2298             : 
    2299           0 :     q = (unsigned char *)s;
    2300           0 :     e = q + size;
    2301             : 
    2302           0 :     if (byteorder)
    2303           0 :         bo = *byteorder;
    2304             : 
    2305             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    2306             :        byte order setting accordingly. In native mode, the leading BOM
    2307             :        mark is skipped, in all other modes, it is copied to the output
    2308             :        stream as-is (giving a ZWNBSP character). */
    2309           0 :     if (bo == 0) {
    2310           0 :         if (size >= 4) {
    2311           0 :             const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
    2312           0 :                 (q[iorder[1]] << 8) | q[iorder[0]];
    2313             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2314           0 :             if (bom == 0x0000FEFF) {
    2315           0 :                 q += 4;
    2316           0 :                 bo = -1;
    2317             :             }
    2318           0 :             else if (bom == 0xFFFE0000) {
    2319           0 :                 q += 4;
    2320           0 :                 bo = 1;
    2321             :             }
    2322             : #else
    2323             :             if (bom == 0x0000FEFF) {
    2324             :                 q += 4;
    2325             :                 bo = 1;
    2326             :             }
    2327             :             else if (bom == 0xFFFE0000) {
    2328             :                 q += 4;
    2329             :                 bo = -1;
    2330             :             }
    2331             : #endif
    2332             :         }
    2333             :     }
    2334             : 
    2335           0 :     if (bo == -1) {
    2336             :         /* force LE */
    2337           0 :         iorder[0] = 0;
    2338           0 :         iorder[1] = 1;
    2339           0 :         iorder[2] = 2;
    2340           0 :         iorder[3] = 3;
    2341             :     }
    2342           0 :     else if (bo == 1) {
    2343             :         /* force BE */
    2344           0 :         iorder[0] = 3;
    2345           0 :         iorder[1] = 2;
    2346           0 :         iorder[2] = 1;
    2347           0 :         iorder[3] = 0;
    2348             :     }
    2349             : 
    2350             :     /* On narrow builds we split characters outside the BMP into two
    2351             :        code points => count how much extra space we need. */
    2352             : #ifndef Py_UNICODE_WIDE
    2353           0 :     for (qq = q; e - qq >= 4; qq += 4)
    2354           0 :         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
    2355           0 :             pairs++;
    2356             : #endif
    2357             : 
    2358             :     /* This might be one too many, because of a BOM */
    2359           0 :     unicode = _PyUnicode_New((size+3)/4+pairs);
    2360           0 :     if (!unicode)
    2361           0 :         return NULL;
    2362           0 :     if (size == 0)
    2363           0 :         return (PyObject *)unicode;
    2364             : 
    2365             :     /* Unpack UTF-32 encoded data */
    2366           0 :     p = unicode->str;
    2367             : 
    2368           0 :     while (q < e) {
    2369             :         Py_UCS4 ch;
    2370             :         /* remaining bytes at the end? (size should be divisible by 4) */
    2371           0 :         if (e-q<4) {
    2372           0 :             if (consumed)
    2373           0 :                 break;
    2374           0 :             errmsg = "truncated data";
    2375           0 :             startinpos = ((const char *)q)-starts;
    2376           0 :             endinpos = ((const char *)e)-starts;
    2377           0 :             goto utf32Error;
    2378             :             /* The remaining input chars are ignored if the callback
    2379             :                chooses to skip the input */
    2380             :         }
    2381           0 :         ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
    2382           0 :             (q[iorder[1]] << 8) | q[iorder[0]];
    2383             : 
    2384           0 :         if (ch >= 0x110000)
    2385             :         {
    2386           0 :             errmsg = "code point not in range(0x110000)";
    2387           0 :             startinpos = ((const char *)q)-starts;
    2388           0 :             endinpos = startinpos+4;
    2389           0 :             goto utf32Error;
    2390             :         }
    2391             : #ifndef Py_UNICODE_WIDE
    2392           0 :         if (ch >= 0x10000)
    2393             :         {
    2394           0 :             *p++ = 0xD800 | ((ch-0x10000) >> 10);
    2395           0 :             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
    2396             :         }
    2397             :         else
    2398             : #endif
    2399           0 :             *p++ = ch;
    2400           0 :         q += 4;
    2401           0 :         continue;
    2402             :       utf32Error:
    2403           0 :         outpos = p-PyUnicode_AS_UNICODE(unicode);
    2404           0 :         if (unicode_decode_call_errorhandler(
    2405             :                 errors, &errorHandler,
    2406             :                 "utf32", errmsg,
    2407             :                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
    2408             :                 &unicode, &outpos, &p))
    2409           0 :             goto onError;
    2410             :     }
    2411             : 
    2412           0 :     if (byteorder)
    2413           0 :         *byteorder = bo;
    2414             : 
    2415           0 :     if (consumed)
    2416           0 :         *consumed = (const char *)q-starts;
    2417             : 
    2418             :     /* Adjust length */
    2419           0 :     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
    2420           0 :         goto onError;
    2421             : 
    2422           0 :     Py_XDECREF(errorHandler);
    2423           0 :     Py_XDECREF(exc);
    2424           0 :     return (PyObject *)unicode;
    2425             : 
    2426             :   onError:
    2427           0 :     Py_DECREF(unicode);
    2428           0 :     Py_XDECREF(errorHandler);
    2429           0 :     Py_XDECREF(exc);
    2430           0 :     return NULL;
    2431             : }
    2432             : 
                       /* Encode the `size` Py_UNICODE units at `s` as UTF-32 and return the
                          bytes in a new string object.  Returns NULL with an exception set
                          when the size check or the allocation fails.
                          byteorder selects the output layout:
                             0 - native byte order, preceded by a U+FEFF byte order mark
                            -1 - little endian, no byte order mark
                             1 - big endian, no byte order mark
                          Any other value keeps the native layout and writes no mark.
                          `errors` is accepted but never referenced in this body.
                          When Py_UNICODE_WIDE is not defined, a high surrogate
                          (0xD800-0xDBFF) immediately followed by a low surrogate
                          (0xDC00-0xDFFF) is merged into a single code point; unpaired
                          surrogates are written out unchanged. */
    2433             : PyObject *
    2434           0 : PyUnicode_EncodeUTF32(const Py_UNICODE *s,
    2435             :                       Py_ssize_t size,
    2436             :                       const char *errors,
    2437             :                       int byteorder)
    2438             : {
    2439             :     PyObject *v;
    2440             :     unsigned char *p;
    2441             :     Py_ssize_t nsize, bytesize;
    2442             : #ifndef Py_UNICODE_WIDE
    2443             :     Py_ssize_t i, pairs;
    2444             : #else
    2445             :     const int pairs = 0;
    2446             : #endif
    2447             :     /* Offsets from p for storing byte pairs in the right order. */
    2448             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2449           0 :     int iorder[] = {0, 1, 2, 3};
    2450             : #else
    2451             :     int iorder[] = {3, 2, 1, 0};
    2452             : #endif
    2453             : 
                           /* STORECHAR writes the four bytes of CH at p, placing byte k of the
                              value at offset iorder[k], then advances p by four. */
    2454             : #define STORECHAR(CH)                           \
    2455             :     do {                                        \
    2456             :         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
    2457             :         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
    2458             :         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
    2459             :         p[iorder[0]] = (CH) & 0xff;             \
    2460             :         p += 4;                                 \
    2461             :     } while(0)
    2462             : 
    2463             :     /* In narrow builds we can output surrogate pairs as one code point,
    2464             :        so we need less space. */
    2465             : #ifndef Py_UNICODE_WIDE
    2466           0 :     for (i = pairs = 0; i < size-1; i++)
    2467           0 :         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
    2468           0 :             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
    2469           0 :             pairs++;
    2470             : #endif
                           /* One 4-byte unit per output code point, plus one for the BOM when
                              byteorder == 0.
                              NOTE(review): nsize * 4 is a signed multiplication performed
                              before the division test below; if it overflows the behaviour is
                              undefined, so the test cannot be relied on.  Comparing nsize
                              against PY_SSIZE_T_MAX / 4 before multiplying would avoid this -
                              confirm and fix in the real source file. */
    2471           0 :     nsize = (size - pairs + (byteorder == 0));
    2472           0 :     bytesize = nsize * 4;
    2473           0 :     if (bytesize / 4 != nsize)
    2474           0 :         return PyErr_NoMemory();
    2475           0 :     v = PyString_FromStringAndSize(NULL, bytesize);
    2476           0 :     if (v == NULL)
    2477           0 :         return NULL;
    2478             : 
    2479           0 :     p = (unsigned char *)PyString_AS_STRING(v);
                           /* iorder still holds the compile-time native layout here, so the
                              BOM is written in native byte order. */
    2480           0 :     if (byteorder == 0)
    2481           0 :         STORECHAR(0xFEFF);
    2482           0 :     if (size == 0)
    2483           0 :         return v;
    2484             : 
    2485           0 :     if (byteorder == -1) {
    2486             :         /* force LE */
    2487           0 :         iorder[0] = 0;
    2488           0 :         iorder[1] = 1;
    2489           0 :         iorder[2] = 2;
    2490           0 :         iorder[3] = 3;
    2491             :     }
    2492           0 :     else if (byteorder == 1) {
    2493             :         /* force BE */
    2494           0 :         iorder[0] = 3;
    2495           0 :         iorder[1] = 2;
    2496           0 :         iorder[2] = 1;
    2497           0 :         iorder[3] = 0;
    2498             :     }
    2499             : 
    2500           0 :     while (size-- > 0) {
    2501           0 :         Py_UCS4 ch = *s++;
    2502             : #ifndef Py_UNICODE_WIDE
                               /* size was already decremented for ch, so size > 0 means at
                                  least one more unit remains to be inspected as the low half. */
    2503           0 :         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
    2504           0 :             Py_UCS4 ch2 = *s;
    2505           0 :             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
    2506           0 :                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
    2507           0 :                 s++;
    2508           0 :                 size--;
    2509             :             }
    2510             :         }
    2511             : #endif
    2512           0 :         STORECHAR(ch);
    2513             :     }
    2514           0 :     return v;
    2515             : #undef STORECHAR
    2516             : }
    2517             : 
                       /* Encode a Unicode object as UTF-32.  Objects that fail
                          PyUnicode_Check are rejected with PyErr_BadArgument() and NULL.
                          Otherwise the object's buffer and length are handed to
                          PyUnicode_EncodeUTF32 with errors == NULL and byteorder == 0, i.e.
                          native byte order with a leading byte order mark. */
    2518           0 : PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
    2519             : {
    2520           0 :     if (!PyUnicode_Check(unicode)) {
    2521           0 :         PyErr_BadArgument();
    2522           0 :         return NULL;
    2523             :     }
    2524           0 :     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
    2525             :                                  PyUnicode_GET_SIZE(unicode),
    2526             :                                  NULL,
    2527             :                                  0);
    2528             : }
    2529             : 
    2530             : /* --- UTF-16 Codec ------------------------------------------------------- */
    2531             : 
                       /* Decode `size` bytes of UTF-16 at `s`.  Thin wrapper that forwards
                          every argument to PyUnicode_DecodeUTF16Stateful and passes NULL for
                          `consumed`. */
    2532             : PyObject *
    2533           0 : PyUnicode_DecodeUTF16(const char *s,
    2534             :                       Py_ssize_t size,
    2535             :                       const char *errors,
    2536             :                       int *byteorder)
    2537             : {
    2538           0 :     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
    2539             : }
    2540             : 
                       /* Decode `size` bytes of UTF-16 at `s` into a new Unicode object.
                          byteorder, if not NULL, supplies the initial byte order (-1 little
                          endian, 1 big endian, 0 native with BOM detection) and, once
                          decoding has finished, receives the order that was used.
                          consumed, if not NULL, selects incremental behaviour: an odd
                          trailing byte, or a surrogate with no complete code unit after it,
                          ends decoding without an error and the number of input bytes used
                          is stored through the pointer.  With consumed == NULL those cases
                          are reported as "truncated data" / "unexpected end of data".
                          errors is forwarded to unicode_decode_call_errorhandler.
                          Returns the new object, or NULL when allocation fails, when the
                          error handler call returns nonzero, or when the final resize
                          fails (the partially built object is released in the latter two
                          cases). */
    2541             : PyObject *
    2542           0 : PyUnicode_DecodeUTF16Stateful(const char *s,
    2543             :                               Py_ssize_t size,
    2544             :                               const char *errors,
    2545             :                               int *byteorder,
    2546             :                               Py_ssize_t *consumed)
    2547             : {
    2548           0 :     const char *starts = s;
    2549             :     Py_ssize_t startinpos;
    2550             :     Py_ssize_t endinpos;
    2551             :     Py_ssize_t outpos;
    2552             :     PyUnicodeObject *unicode;
    2553             :     Py_UNICODE *p;
    2554             :     const unsigned char *q, *e;
    2555           0 :     int bo = 0;       /* assume native ordering by default */
    2556           0 :     const char *errmsg = "";
    2557             :     /* Offsets from q for retrieving byte pairs in the right order. */
    2558             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2559           0 :     int ihi = 1, ilo = 0;
    2560             : #else
    2561             :     int ihi = 0, ilo = 1;
    2562             : #endif
    2563           0 :     PyObject *errorHandler = NULL;
    2564           0 :     PyObject *exc = NULL;
    2565             : 
    2566             :     /* Note: size will always be longer than the resulting Unicode
    2567             :        character count */
    2568           0 :     unicode = _PyUnicode_New(size);
    2569           0 :     if (!unicode)
    2570           0 :         return NULL;
                           /* NOTE(review): this early return stores nothing through byteorder
                              or consumed, so callers must not rely on either output for empty
                              input - confirm against the callers. */
    2571           0 :     if (size == 0)
    2572           0 :         return (PyObject *)unicode;
    2573             : 
    2574             :     /* Unpack UTF-16 encoded data */
    2575           0 :     p = unicode->str;
    2576           0 :     q = (unsigned char *)s;
    2577           0 :     e = q + size;
    2578             : 
    2579           0 :     if (byteorder)
    2580           0 :         bo = *byteorder;
    2581             : 
    2582             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    2583             :        byte order setting accordingly. In native mode, the leading BOM
    2584             :        mark is skipped, in all other modes, it is copied to the output
    2585             :        stream as-is (giving a ZWNBSP character). */
    2586           0 :     if (bo == 0) {
    2587           0 :         if (size >= 2) {
                                   /* The first two bytes are read with the native offsets: 0xFEFF
                                      means the data is in native order, 0xFFFE means it is
                                      byte-swapped.  If neither matches, bo stays 0 and q is not
                                      advanced. */
    2588           0 :             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
    2589             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2590           0 :             if (bom == 0xFEFF) {
    2591           0 :                 q += 2;
    2592           0 :                 bo = -1;
    2593             :             }
    2594           0 :             else if (bom == 0xFFFE) {
    2595           0 :                 q += 2;
    2596           0 :                 bo = 1;
    2597             :             }
    2598             : #else
    2599             :             if (bom == 0xFEFF) {
    2600             :                 q += 2;
    2601             :                 bo = 1;
    2602             :             }
    2603             :             else if (bom == 0xFFFE) {
    2604             :                 q += 2;
    2605             :                 bo = -1;
    2606             :             }
    2607             : #endif
    2608             :         }
    2609             :     }
    2610             : 
    2611           0 :     if (bo == -1) {
    2612             :         /* force LE */
    2613           0 :         ihi = 1;
    2614           0 :         ilo = 0;
    2615             :     }
    2616           0 :     else if (bo == 1) {
    2617             :         /* force BE */
    2618           0 :         ihi = 0;
    2619           0 :         ilo = 1;
    2620             :     }
    2621             : 
    2622           0 :     while (q < e) {
    2623             :         Py_UNICODE ch;
    2624             :         /* remaining bytes at the end? (size should be even) */
    2625           0 :         if (e-q<2) {
    2626           0 :             if (consumed)
    2627           0 :                 break;
    2628           0 :             errmsg = "truncated data";
    2629           0 :             startinpos = ((const char *)q)-starts;
    2630           0 :             endinpos = ((const char *)e)-starts;
    2631           0 :             goto utf16Error;
    2632             :             /* The remaining input chars are ignored if the callback
    2633             :                chooses to skip the input */
    2634             :         }
    2635           0 :         ch = (q[ihi] << 8) | q[ilo];
    2636             : 
    2637           0 :         q += 2;
    2638             : 
    2639           0 :         if (ch < 0xD800 || ch > 0xDFFF) {
    2640           0 :             *p++ = ch;
    2641           0 :             continue;
    2642             :         }
    2643             : 
                               /* ch is a surrogate and q already points past it.  If no
                                  complete code unit follows, q is rewound so the surrogate is
                                  either left unconsumed (consumed != NULL) or reported starting
                                  at its first byte. */
    2644             :         /* UTF-16 code pair: */
    2645           0 :         if (e - q < 2) {
    2646           0 :             q -= 2;
    2647           0 :             if (consumed)
    2648           0 :                 break;
    2649           0 :             errmsg = "unexpected end of data";
    2650           0 :             startinpos = ((const char *)q)-starts;
    2651           0 :             endinpos = ((const char *)e)-starts;
    2652           0 :             goto utf16Error;
    2653             :         }
    2654           0 :         if (0xD800 <= ch && ch <= 0xDBFF) {
    2655           0 :             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
    2656           0 :             q += 2;
    2657           0 :             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                                       /* Valid pair: kept as two units when Py_UNICODE_WIDE is
                                          not defined, otherwise combined into one code point. */
    2658             : #ifndef Py_UNICODE_WIDE
    2659           0 :                 *p++ = ch;
    2660           0 :                 *p++ = ch2;
    2661             : #else
    2662             :                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
    2663             : #endif
    2664           0 :                 continue;
    2665             :             }
    2666             :             else {
                                       /* q is now four bytes past the start of the high
                                          surrogate; the reported range covers only those first
                                          two bytes. */
    2667           0 :                 errmsg = "illegal UTF-16 surrogate";
    2668           0 :                 startinpos = (((const char *)q)-4)-starts;
    2669           0 :                 endinpos = startinpos+2;
    2670           0 :                 goto utf16Error;
    2671             :             }
    2672             : 
    2673             :         }
                               /* Reached only when ch is in 0xDC00-0xDFFF, i.e. a low surrogate
                                  that was not preceded by a high surrogate. */
    2674           0 :         errmsg = "illegal encoding";
    2675           0 :         startinpos = (((const char *)q)-2)-starts;
    2676           0 :         endinpos = startinpos+2;
    2677             :         /* Fall through to report the error */
    2678             : 
    2679             :       utf16Error:
                               /* The handler receives the addresses of q, unicode, outpos and
                                  p - presumably so it can reposition the input and output and
                                  grow the buffer; its definition is outside this view.  A
                                  nonzero return aborts decoding. */
    2680           0 :         outpos = p-PyUnicode_AS_UNICODE(unicode);
    2681           0 :         if (unicode_decode_call_errorhandler(
    2682             :                 errors, &errorHandler,
    2683             :                 "utf16", errmsg,
    2684             :                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
    2685             :                 &unicode, &outpos, &p))
    2686           0 :             goto onError;
    2687             :     }
    2688             : 
    2689           0 :     if (byteorder)
    2690           0 :         *byteorder = bo;
    2691             : 
    2692           0 :     if (consumed)
    2693           0 :         *consumed = (const char *)q-starts;
    2694             : 
    2695             :     /* Adjust length */
    2696           0 :     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
    2697           0 :         goto onError;
    2698             : 
    2699           0 :     Py_XDECREF(errorHandler);
    2700           0 :     Py_XDECREF(exc);
    2701           0 :     return (PyObject *)unicode;
    2702             : 
    2703             :   onError:
    2704           0 :     Py_DECREF(unicode);
    2705           0 :     Py_XDECREF(errorHandler);
    2706           0 :     Py_XDECREF(exc);
    2707           0 :     return NULL;
    2708             : }
    2709             : 
                       /* Encode the `size` Py_UNICODE units at `s` as UTF-16 and return the
                          bytes in a new string object.  Returns NULL with an exception set
                          when a size check or the allocation fails.
                          byteorder selects the output layout:
                             0 - native byte order, preceded by a U+FEFF byte order mark
                            -1 - little endian, no byte order mark
                             1 - big endian, no byte order mark
                          Any other value keeps the native layout and writes no mark.
                          `errors` is accepted but never referenced in this body.
                          When Py_UNICODE_WIDE is defined, every unit >= 0x10000 is written
                          as a surrogate pair; otherwise units are written one-to-one. */
    2710             : PyObject *
    2711           0 : PyUnicode_EncodeUTF16(const Py_UNICODE *s,
    2712             :                       Py_ssize_t size,
    2713             :                       const char *errors,
    2714             :                       int byteorder)
    2715             : {
    2716             :     PyObject *v;
    2717             :     unsigned char *p;
    2718             :     Py_ssize_t nsize, bytesize;
    2719             : #ifdef Py_UNICODE_WIDE
    2720             :     Py_ssize_t i, pairs;
    2721             : #else
    2722           0 :     const int pairs = 0;
    2723             : #endif
    2724             :     /* Offsets from p for storing byte pairs in the right order. */
    2725             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    2726           0 :     int ihi = 1, ilo = 0;
    2727             : #else
    2728             :     int ihi = 0, ilo = 1;
    2729             : #endif
    2730             : 
                           /* STORECHAR writes the low 16 bits of CH at p, high byte at offset
                              ihi and low byte at offset ilo, then advances p by two. */
    2731             : #define STORECHAR(CH)                           \
    2732             :     do {                                        \
    2733             :         p[ihi] = ((CH) >> 8) & 0xff;            \
    2734             :         p[ilo] = (CH) & 0xff;                   \
    2735             :         p += 2;                                 \
    2736             :     } while(0)
    2737             : 
    2738             : #ifdef Py_UNICODE_WIDE
    2739             :     for (i = pairs = 0; i < size; i++)
    2740             :         if (s[i] >= 0x10000)
    2741             :             pairs++;
    2742             : #endif
                           /* NOTE(review): size is a Py_ssize_t, so `size > PY_SSIZE_T_MAX`
                              looks like it can never be true (assuming PY_SSIZE_T_MAX is that
                              type's maximum) - confirm.  Also, nsize * 2 below is a signed
                              multiplication performed before the division test; if it
                              overflows the behaviour is undefined, so comparing nsize against
                              PY_SSIZE_T_MAX / 2 first would be safer.  Fix in the real source
                              file. */
    2743             :     /* 2 * (size + pairs + (byteorder == 0)) */
    2744           0 :     if (size > PY_SSIZE_T_MAX ||
    2745           0 :         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
    2746           0 :         return PyErr_NoMemory();
    2747           0 :     nsize = size + pairs + (byteorder == 0);
    2748           0 :     bytesize = nsize * 2;
    2749           0 :     if (bytesize / 2 != nsize)
    2750           0 :         return PyErr_NoMemory();
    2751           0 :     v = PyString_FromStringAndSize(NULL, bytesize);
    2752           0 :     if (v == NULL)
    2753           0 :         return NULL;
    2754             : 
    2755           0 :     p = (unsigned char *)PyString_AS_STRING(v);
                           /* ihi/ilo still hold the compile-time native layout here, so the
                              BOM is written in native byte order. */
    2756           0 :     if (byteorder == 0)
    2757           0 :         STORECHAR(0xFEFF);
    2758           0 :     if (size == 0)
    2759           0 :         return v;
    2760             : 
    2761           0 :     if (byteorder == -1) {
    2762             :         /* force LE */
    2763           0 :         ihi = 1;
    2764           0 :         ilo = 0;
    2765             :     }
    2766           0 :     else if (byteorder == 1) {
    2767             :         /* force BE */
    2768           0 :         ihi = 0;
    2769           0 :         ilo = 1;
    2770             :     }
    2771             : 
    2772           0 :     while (size-- > 0) {
    2773           0 :         Py_UNICODE ch = *s++;
                               /* ch2 stays 0 unless ch is split below; a split always yields a
                                  nonzero ch2 because 0xDC00 is OR-ed in, so `if (ch2)` doubles
                                  as the "was split" flag. */
    2774           0 :         Py_UNICODE ch2 = 0;
    2775             : #ifdef Py_UNICODE_WIDE
    2776             :         if (ch >= 0x10000) {
    2777             :             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
    2778             :             ch  = 0xD800 | ((ch-0x10000) >> 10);
    2779             :         }
    2780             : #endif
    2781           0 :         STORECHAR(ch);
    2782           0 :         if (ch2)
    2783           0 :             STORECHAR(ch2);
    2784             :     }
    2785           0 :     return v;
    2786             : #undef STORECHAR
    2787             : }
    2788             : 
                       /* Encode a Unicode object as UTF-16.  Objects that fail
                          PyUnicode_Check are rejected with PyErr_BadArgument() and NULL.
                          Otherwise the object's buffer and length are handed to
                          PyUnicode_EncodeUTF16 with errors == NULL and byteorder == 0, i.e.
                          native byte order with a leading byte order mark. */
    2789           0 : PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
    2790             : {
    2791           0 :     if (!PyUnicode_Check(unicode)) {
    2792           0 :         PyErr_BadArgument();
    2793           0 :         return NULL;
    2794             :     }
    2795           0 :     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
    2796             :                                  PyUnicode_GET_SIZE(unicode),
    2797             :                                  NULL,
    2798             :                                  0);
    2799             : }
    2800             : 
    2801             : /* --- Unicode Escape Codec ----------------------------------------------- */
    2802             : 
                       /* Cached pointer to the character-name lookup API.  It starts out NULL
                          and is filled in by PyUnicode_DecodeUnicodeEscape, via
                          PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1), the first time a
                          \N escape is decoded; its getcode member is then used to resolve
                          the name to a code point. */
    2803             : static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
    2804             : 
    2805           0 : PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
    2806             :                                         Py_ssize_t size,
    2807             :                                         const char *errors)
    2808             : {
    2809           0 :     const char *starts = s;
    2810             :     Py_ssize_t startinpos;
    2811             :     Py_ssize_t endinpos;
    2812             :     Py_ssize_t outpos;
    2813             :     PyUnicodeObject *v;
    2814             :     Py_UNICODE *p;
    2815             :     const char *end;
    2816             :     char* message;
    2817           0 :     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
    2818           0 :     PyObject *errorHandler = NULL;
    2819           0 :     PyObject *exc = NULL;
    2820             : 
    2821             :     /* Escaped strings will always be longer than the resulting
    2822             :        Unicode string, so we start with size here and then reduce the
    2823             :        length after conversion to the true value.
    2824             :        (but if the error callback returns a long replacement string
    2825             :        we'll have to allocate more space) */
    2826           0 :     v = _PyUnicode_New(size);
    2827           0 :     if (v == NULL)
    2828           0 :         goto onError;
    2829           0 :     if (size == 0)
    2830           0 :         return (PyObject *)v;
    2831             : 
    2832           0 :     p = PyUnicode_AS_UNICODE(v);
    2833           0 :     end = s + size;
    2834             : 
    2835           0 :     while (s < end) {
    2836             :         unsigned char c;
    2837             :         Py_UNICODE x;
    2838             :         int digits;
    2839             : 
    2840             :         /* Non-escape characters are interpreted as Unicode ordinals */
    2841           0 :         if (*s != '\\') {
    2842           0 :             *p++ = (unsigned char) *s++;
    2843           0 :             continue;
    2844             :         }
    2845             : 
    2846           0 :         startinpos = s-starts;
    2847             :         /* \ - Escapes */
    2848           0 :         s++;
    2849           0 :         c = *s++;
    2850           0 :         if (s > end)
    2851           0 :             c = '\0'; /* Invalid after \ */
    2852           0 :         switch (c) {
    2853             : 
    2854             :             /* \x escapes */
    2855           0 :         case '\n': break;
    2856           0 :         case '\\': *p++ = '\\'; break;
    2857           0 :         case '\'': *p++ = '\''; break;
    2858           0 :         case '\"': *p++ = '\"'; break;
    2859           0 :         case 'b': *p++ = '\b'; break;
    2860           0 :         case 'f': *p++ = '\014'; break; /* FF */
    2861           0 :         case 't': *p++ = '\t'; break;
    2862           0 :         case 'n': *p++ = '\n'; break;
    2863           0 :         case 'r': *p++ = '\r'; break;
    2864           0 :         case 'v': *p++ = '\013'; break; /* VT */
    2865           0 :         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
    2866             : 
    2867             :             /* \OOO (octal) escapes */
    2868             :         case '0': case '1': case '2': case '3':
    2869             :         case '4': case '5': case '6': case '7':
    2870           0 :             x = s[-1] - '0';
    2871           0 :             if (s < end && '0' <= *s && *s <= '7') {
    2872           0 :                 x = (x<<3) + *s++ - '0';
    2873           0 :                 if (s < end && '0' <= *s && *s <= '7')
    2874           0 :                     x = (x<<3) + *s++ - '0';
    2875             :             }
    2876           0 :             *p++ = x;
    2877           0 :             break;
    2878             : 
    2879             :             /* hex escapes */
    2880             :             /* \xXX */
    2881             :         case 'x':
    2882           0 :             digits = 2;
    2883           0 :             message = "truncated \\xXX escape";
    2884           0 :             goto hexescape;
    2885             : 
    2886             :             /* \uXXXX */
    2887             :         case 'u':
    2888           0 :             digits = 4;
    2889           0 :             message = "truncated \\uXXXX escape";
    2890           0 :             goto hexescape;
    2891             : 
    2892             :             /* \UXXXXXXXX */
    2893             :         case 'U':
    2894           0 :             digits = 8;
    2895           0 :             message = "truncated \\UXXXXXXXX escape";
    2896             :         hexescape:
    2897           0 :             chr = 0;
    2898           0 :             if (end - s < digits) {
    2899             :                 /* count only hex digits */
    2900           0 :                 for (; s < end; ++s) {
    2901           0 :                     c = (unsigned char)*s;
    2902           0 :                     if (!Py_ISXDIGIT(c))
    2903           0 :                         goto error;
    2904             :                 }
    2905           0 :                 goto error;
    2906             :             }
    2907           0 :             for (; digits--; ++s) {
    2908           0 :                 c = (unsigned char)*s;
    2909           0 :                 if (!Py_ISXDIGIT(c))
    2910           0 :                     goto error;
    2911           0 :                 chr = (chr<<4) & ~0xF;
    2912           0 :                 if (c >= '0' && c <= '9')
    2913           0 :                     chr += c - '0';
    2914           0 :                 else if (c >= 'a' && c <= 'f')
    2915           0 :                     chr += 10 + c - 'a';
    2916             :                 else
    2917           0 :                     chr += 10 + c - 'A';
    2918             :             }
    2919           0 :             if (chr == 0xffffffff && PyErr_Occurred())
    2920             :                 /* _decoding_error will have already written into the
    2921             :                    target buffer. */
    2922           0 :                 break;
    2923             :         store:
    2924             :             /* when we get here, chr is a 32-bit unicode character */
    2925           0 :             if (chr <= 0xffff)
    2926             :                 /* UCS-2 character */
    2927           0 :                 *p++ = (Py_UNICODE) chr;
    2928           0 :             else if (chr <= 0x10ffff) {
    2929             :                 /* UCS-4 character. Either store directly, or as
    2930             :                    surrogate pair. */
    2931             : #ifdef Py_UNICODE_WIDE
    2932             :                 *p++ = chr;
    2933             : #else
    2934           0 :                 chr -= 0x10000L;
    2935           0 :                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
    2936           0 :                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
    2937             : #endif
    2938             :             } else {
    2939           0 :                 message = "illegal Unicode character";
    2940           0 :                 goto error;
    2941             :             }
    2942           0 :             break;
    2943             : 
    2944             :             /* \N{name} */
    2945             :         case 'N':
    2946           0 :             message = "malformed \\N character escape";
    2947           0 :             if (ucnhash_CAPI == NULL) {
    2948             :                 /* load the unicode data module */
    2949           0 :                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
    2950           0 :                 if (ucnhash_CAPI == NULL)
    2951           0 :                     goto ucnhashError;
    2952             :             }
    2953           0 :             if (*s == '{') {
    2954           0 :                 const char *start = s+1;
    2955             :                 /* look for the closing brace */
    2956           0 :                 while (*s != '}' && s < end)
    2957           0 :                     s++;
    2958           0 :                 if (s > start && s < end && *s == '}') {
    2959             :                     /* found a name.  look it up in the unicode database */
    2960           0 :                     message = "unknown Unicode character name";
    2961           0 :                     s++;
    2962           0 :                     if (s - start - 1 <= INT_MAX &&
    2963           0 :                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
    2964           0 :                         goto store;
    2965             :                 }
    2966             :             }
    2967           0 :             goto error;
    2968             : 
    2969             :         default:
    2970           0 :             if (s > end) {
    2971           0 :                 message = "\\ at end of string";
    2972           0 :                 s--;
    2973           0 :                 goto error;
    2974             :             }
    2975             :             else {
    2976           0 :                 *p++ = '\\';
    2977           0 :                 *p++ = (unsigned char)s[-1];
    2978             :             }
    2979           0 :             break;
    2980             :         }
    2981           0 :         continue;
    2982             : 
    2983             :       error:
    2984           0 :         endinpos = s-starts;
    2985           0 :         outpos = p-PyUnicode_AS_UNICODE(v);
    2986           0 :         if (unicode_decode_call_errorhandler(
    2987             :                 errors, &errorHandler,
    2988             :                 "unicodeescape", message,
    2989             :                 starts, size, &startinpos, &endinpos, &exc, &s,
    2990             :                 &v, &outpos, &p))
    2991           0 :             goto onError;
    2992           0 :         continue;
    2993             :     }
    2994           0 :     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
    2995           0 :         goto onError;
    2996           0 :     Py_XDECREF(errorHandler);
    2997           0 :     Py_XDECREF(exc);
    2998           0 :     return (PyObject *)v;
    2999             : 
    3000             :   ucnhashError:
    3001           0 :     PyErr_SetString(
    3002             :         PyExc_UnicodeError,
    3003             :         "\\N escapes not supported (can't load unicodedata module)"
    3004             :         );
    3005           0 :     Py_XDECREF(v);
    3006           0 :     Py_XDECREF(errorHandler);
    3007           0 :     Py_XDECREF(exc);
    3008           0 :     return NULL;
    3009             : 
    3010             :   onError:
    3011           0 :     Py_XDECREF(v);
    3012           0 :     Py_XDECREF(errorHandler);
    3013           0 :     Py_XDECREF(exc);
    3014           0 :     return NULL;
    3015             : }
    3016             : 
    3017             : /* Return a Unicode-Escape string version of the Unicode object.
    3018             : 
    3019             :    If quotes is true, the string is enclosed in u"" or u'' quotes as
    3020             :    appropriate.
    3021             : 
    3022             : */
    3023             : 
    3024           0 : Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
    3025             :                                              Py_ssize_t size,
    3026             :                                              Py_UNICODE ch)
    3027             : {
    3028             :     /* like wcschr, but doesn't stop at NULL characters */
    3029             : 
    3030           0 :     while (size-- > 0) {
    3031           0 :         if (*s == ch)
    3032           0 :             return s;
    3033           0 :         s++;
    3034             :     }
    3035             : 
    3036           0 :     return NULL;
    3037             : }
    3038             : 
    3039             : static
    3040           0 : PyObject *unicodeescape_string(const Py_UNICODE *s,
    3041             :                                Py_ssize_t size,
    3042             :                                int quotes)
    3043             : {
    3044             :     PyObject *repr;
    3045             :     char *p;
    3046             : 
    3047             :     static const char *hexdigit = "0123456789abcdef";
    3048             : #ifdef Py_UNICODE_WIDE
    3049             :     const Py_ssize_t expandsize = 10;
    3050             : #else
    3051           0 :     const Py_ssize_t expandsize = 6;
    3052             : #endif
    3053             : 
    3054             :     /* XXX(nnorwitz): rather than over-allocating, it would be
    3055             :        better to choose a different scheme.  Perhaps scan the
    3056             :        first N-chars of the string and allocate based on that size.
    3057             :     */
    3058             :     /* Initial allocation is based on the longest-possible unichr
    3059             :        escape.
    3060             : 
    3061             :        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
    3062             :        unichr, so in this case it's the longest unichr escape. In
    3063             :        narrow (UTF-16) builds this is five chars per source unichr
    3064             :        since there are two unichrs in the surrogate pair, so in narrow
    3065             :        (UTF-16) builds it's not the longest unichr escape.
    3066             : 
    3067             :        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
    3068             :        so in the narrow (UTF-16) build case it's the longest unichr
    3069             :        escape.
    3070             :     */
    3071             : 
    3072           0 :     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
    3073           0 :         return PyErr_NoMemory();
    3074             : 
    3075           0 :     repr = PyString_FromStringAndSize(NULL,
    3076             :                                       2
    3077           0 :                                       + expandsize*size
    3078             :                                       + 1);
    3079           0 :     if (repr == NULL)
    3080           0 :         return NULL;
    3081             : 
    3082           0 :     p = PyString_AS_STRING(repr);
    3083             : 
    3084           0 :     if (quotes) {
    3085           0 :         *p++ = 'u';
    3086           0 :         *p++ = (findchar(s, size, '\'') &&
    3087           0 :                 !findchar(s, size, '"')) ? '"' : '\'';
    3088             :     }
    3089           0 :     while (size-- > 0) {
    3090           0 :         Py_UNICODE ch = *s++;
    3091             : 
    3092             :         /* Escape quotes and backslashes */
    3093           0 :         if ((quotes &&
    3094           0 :              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
    3095           0 :             *p++ = '\\';
    3096           0 :             *p++ = (char) ch;
    3097           0 :             continue;
    3098             :         }
    3099             : 
    3100             : #ifdef Py_UNICODE_WIDE
    3101             :         /* Map 21-bit characters to '\U00xxxxxx' */
    3102             :         else if (ch >= 0x10000) {
    3103             :             *p++ = '\\';
    3104             :             *p++ = 'U';
    3105             :             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
    3106             :             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
    3107             :             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
    3108             :             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
    3109             :             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
    3110             :             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
    3111             :             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
    3112             :             *p++ = hexdigit[ch & 0x0000000F];
    3113             :             continue;
    3114             :         }
    3115             : #else
    3116             :         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
    3117           0 :         else if (ch >= 0xD800 && ch < 0xDC00) {
    3118             :             Py_UNICODE ch2;
    3119             :             Py_UCS4 ucs;
    3120             : 
    3121           0 :             ch2 = *s++;
    3122           0 :             size--;
    3123           0 :             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
    3124           0 :                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
    3125           0 :                 *p++ = '\\';
    3126           0 :                 *p++ = 'U';
    3127           0 :                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
    3128           0 :                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
    3129           0 :                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
    3130           0 :                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
    3131           0 :                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
    3132           0 :                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
    3133           0 :                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
    3134           0 :                 *p++ = hexdigit[ucs & 0x0000000F];
    3135           0 :                 continue;
    3136             :             }
    3137             :             /* Fall through: isolated surrogates are copied as-is */
    3138           0 :             s--;
    3139           0 :             size++;
    3140             :         }
    3141             : #endif
    3142             : 
    3143             :         /* Map 16-bit characters to '\uxxxx' */
    3144           0 :         if (ch >= 256) {
    3145           0 :             *p++ = '\\';
    3146           0 :             *p++ = 'u';
    3147           0 :             *p++ = hexdigit[(ch >> 12) & 0x000F];
    3148           0 :             *p++ = hexdigit[(ch >> 8) & 0x000F];
    3149           0 :             *p++ = hexdigit[(ch >> 4) & 0x000F];
    3150           0 :             *p++ = hexdigit[ch & 0x000F];
    3151             :         }
    3152             : 
    3153             :         /* Map special whitespace to '\t', \n', '\r' */
    3154           0 :         else if (ch == '\t') {
    3155           0 :             *p++ = '\\';
    3156           0 :             *p++ = 't';
    3157             :         }
    3158           0 :         else if (ch == '\n') {
    3159           0 :             *p++ = '\\';
    3160           0 :             *p++ = 'n';
    3161             :         }
    3162           0 :         else if (ch == '\r') {
    3163           0 :             *p++ = '\\';
    3164           0 :             *p++ = 'r';
    3165             :         }
    3166             : 
    3167             :         /* Map non-printable US ASCII to '\xhh' */
    3168           0 :         else if (ch < ' ' || ch >= 0x7F) {
    3169           0 :             *p++ = '\\';
    3170           0 :             *p++ = 'x';
    3171           0 :             *p++ = hexdigit[(ch >> 4) & 0x000F];
    3172           0 :             *p++ = hexdigit[ch & 0x000F];
    3173             :         }
    3174             : 
    3175             :         /* Copy everything else as-is */
    3176             :         else
    3177           0 :             *p++ = (char) ch;
    3178             :     }
    3179           0 :     if (quotes)
    3180           0 :         *p++ = PyString_AS_STRING(repr)[1];
    3181             : 
    3182           0 :     *p = '\0';
    3183           0 :     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
    3184           0 :         return NULL;
    3185           0 :     return repr;
    3186             : }
    3187             : 
    3188           0 : PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
    3189             :                                         Py_ssize_t size)
    3190             : {
    3191           0 :     return unicodeescape_string(s, size, 0);
    3192             : }
    3193             : 
    3194           0 : PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
    3195             : {
    3196           0 :     if (!PyUnicode_Check(unicode)) {
    3197           0 :         PyErr_BadArgument();
    3198           0 :         return NULL;
    3199             :     }
    3200           0 :     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
    3201             :                                          PyUnicode_GET_SIZE(unicode));
    3202             : }
    3203             : 
    3204             : /* --- Raw Unicode Escape Codec ------------------------------------------- */
    3205             : 
    3206           0 : PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
    3207             :                                            Py_ssize_t size,
    3208             :                                            const char *errors)
    3209             : {
    3210           0 :     const char *starts = s;
    3211             :     Py_ssize_t startinpos;
    3212             :     Py_ssize_t endinpos;
    3213             :     Py_ssize_t outpos;
    3214             :     PyUnicodeObject *v;
    3215             :     Py_UNICODE *p;
    3216             :     const char *end;
    3217             :     const char *bs;
    3218           0 :     PyObject *errorHandler = NULL;
    3219           0 :     PyObject *exc = NULL;
    3220             : 
    3221             :     /* Escaped strings will always be longer than the resulting
    3222             :        Unicode string, so we start with size here and then reduce the
    3223             :        length after conversion to the true value. (But decoding error
    3224             :        handler might have to resize the string) */
    3225           0 :     v = _PyUnicode_New(size);
    3226           0 :     if (v == NULL)
    3227           0 :         goto onError;
    3228           0 :     if (size == 0)
    3229           0 :         return (PyObject *)v;
    3230           0 :     p = PyUnicode_AS_UNICODE(v);
    3231           0 :     end = s + size;
    3232           0 :     while (s < end) {
    3233             :         unsigned char c;
    3234             :         Py_UCS4 x;
    3235             :         int i;
    3236             :         int count;
    3237             : 
    3238             :         /* Non-escape characters are interpreted as Unicode ordinals */
    3239           0 :         if (*s != '\\') {
    3240           0 :             *p++ = (unsigned char)*s++;
    3241           0 :             continue;
    3242             :         }
    3243           0 :         startinpos = s-starts;
    3244             : 
    3245             :         /* \u-escapes are only interpreted iff the number of leading
    3246             :            backslashes if odd */
    3247           0 :         bs = s;
    3248           0 :         for (;s < end;) {
    3249           0 :             if (*s != '\\')
    3250           0 :                 break;
    3251           0 :             *p++ = (unsigned char)*s++;
    3252             :         }
    3253           0 :         if (((s - bs) & 1) == 0 ||
    3254           0 :             s >= end ||
    3255           0 :             (*s != 'u' && *s != 'U')) {
    3256           0 :             continue;
    3257             :         }
    3258           0 :         p--;
    3259           0 :         count = *s=='u' ? 4 : 8;
    3260           0 :         s++;
    3261             : 
    3262             :         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
    3263           0 :         outpos = p-PyUnicode_AS_UNICODE(v);
    3264           0 :         for (x = 0, i = 0; i < count; ++i, ++s) {
    3265           0 :             c = (unsigned char)*s;
    3266           0 :             if (!isxdigit(c)) {
    3267           0 :                 endinpos = s-starts;
    3268           0 :                 if (unicode_decode_call_errorhandler(
    3269             :                         errors, &errorHandler,
    3270             :                         "rawunicodeescape", "truncated \\uXXXX",
    3271             :                         starts, size, &startinpos, &endinpos, &exc, &s,
    3272             :                         &v, &outpos, &p))
    3273           0 :                     goto onError;
    3274           0 :                 goto nextByte;
    3275             :             }
    3276           0 :             x = (x<<4) & ~0xF;
    3277           0 :             if (c >= '0' && c <= '9')
    3278           0 :                 x += c - '0';
    3279           0 :             else if (c >= 'a' && c <= 'f')
    3280           0 :                 x += 10 + c - 'a';
    3281             :             else
    3282           0 :                 x += 10 + c - 'A';
    3283             :         }
    3284           0 :         if (x <= 0xffff)
    3285             :             /* UCS-2 character */
    3286           0 :             *p++ = (Py_UNICODE) x;
    3287           0 :         else if (x <= 0x10ffff) {
    3288             :             /* UCS-4 character. Either store directly, or as
    3289             :                surrogate pair. */
    3290             : #ifdef Py_UNICODE_WIDE
    3291             :             *p++ = (Py_UNICODE) x;
    3292             : #else
    3293           0 :             x -= 0x10000L;
    3294           0 :             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
    3295           0 :             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
    3296             : #endif
    3297             :         } else {
    3298           0 :             endinpos = s-starts;
    3299           0 :             outpos = p-PyUnicode_AS_UNICODE(v);
    3300           0 :             if (unicode_decode_call_errorhandler(
    3301             :                     errors, &errorHandler,
    3302             :                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
    3303             :                     starts, size, &startinpos, &endinpos, &exc, &s,
    3304             :                     &v, &outpos, &p))
    3305           0 :                 goto onError;
    3306             :         }
    3307             :       nextByte:
    3308             :         ;
    3309             :     }
    3310           0 :     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
    3311           0 :         goto onError;
    3312           0 :     Py_XDECREF(errorHandler);
    3313           0 :     Py_XDECREF(exc);
    3314           0 :     return (PyObject *)v;
    3315             : 
    3316             :   onError:
    3317           0 :     Py_XDECREF(v);
    3318           0 :     Py_XDECREF(errorHandler);
    3319           0 :     Py_XDECREF(exc);
    3320           0 :     return NULL;
    3321             : }
    3322             : 
    3323           0 : PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
    3324             :                                            Py_ssize_t size)
    3325             : {
    3326             :     PyObject *repr;
    3327             :     char *p;
    3328             :     char *q;
    3329             : 
    3330             :     static const char *hexdigit = "0123456789abcdef";
    3331             : #ifdef Py_UNICODE_WIDE
    3332             :     const Py_ssize_t expandsize = 10;
    3333             : #else
    3334           0 :     const Py_ssize_t expandsize = 6;
    3335             : #endif
    3336             : 
    3337           0 :     if (size > PY_SSIZE_T_MAX / expandsize)
    3338           0 :         return PyErr_NoMemory();
    3339             : 
    3340           0 :     repr = PyString_FromStringAndSize(NULL, expandsize * size);
    3341           0 :     if (repr == NULL)
    3342           0 :         return NULL;
    3343           0 :     if (size == 0)
    3344           0 :         return repr;
    3345             : 
    3346           0 :     p = q = PyString_AS_STRING(repr);
    3347           0 :     while (size-- > 0) {
    3348           0 :         Py_UNICODE ch = *s++;
    3349             : #ifdef Py_UNICODE_WIDE
    3350             :         /* Map 32-bit characters to '\Uxxxxxxxx' */
    3351             :         if (ch >= 0x10000) {
    3352             :             *p++ = '\\';
    3353             :             *p++ = 'U';
    3354             :             *p++ = hexdigit[(ch >> 28) & 0xf];
    3355             :             *p++ = hexdigit[(ch >> 24) & 0xf];
    3356             :             *p++ = hexdigit[(ch >> 20) & 0xf];
    3357             :             *p++ = hexdigit[(ch >> 16) & 0xf];
    3358             :             *p++ = hexdigit[(ch >> 12) & 0xf];
    3359             :             *p++ = hexdigit[(ch >> 8) & 0xf];
    3360             :             *p++ = hexdigit[(ch >> 4) & 0xf];
    3361             :             *p++ = hexdigit[ch & 15];
    3362             :         }
    3363             :         else
    3364             : #else
    3365             :             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
    3366           0 :             if (ch >= 0xD800 && ch < 0xDC00) {
    3367             :                 Py_UNICODE ch2;
    3368             :                 Py_UCS4 ucs;
    3369             : 
    3370           0 :                 ch2 = *s++;
    3371           0 :                 size--;
    3372           0 :                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
    3373           0 :                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
    3374           0 :                     *p++ = '\\';
    3375           0 :                     *p++ = 'U';
    3376           0 :                     *p++ = hexdigit[(ucs >> 28) & 0xf];
    3377           0 :                     *p++ = hexdigit[(ucs >> 24) & 0xf];
    3378           0 :                     *p++ = hexdigit[(ucs >> 20) & 0xf];
    3379           0 :                     *p++ = hexdigit[(ucs >> 16) & 0xf];
    3380           0 :                     *p++ = hexdigit[(ucs >> 12) & 0xf];
    3381           0 :                     *p++ = hexdigit[(ucs >> 8) & 0xf];
    3382           0 :                     *p++ = hexdigit[(ucs >> 4) & 0xf];
    3383           0 :                     *p++ = hexdigit[ucs & 0xf];
    3384           0 :                     continue;
    3385             :                 }
    3386             :                 /* Fall through: isolated surrogates are copied as-is */
    3387           0 :                 s--;
    3388           0 :                 size++;
    3389             :             }
    3390             : #endif
    3391             :         /* Map 16-bit characters to '\uxxxx' */
    3392           0 :         if (ch >= 256) {
    3393           0 :             *p++ = '\\';
    3394           0 :             *p++ = 'u';
    3395           0 :             *p++ = hexdigit[(ch >> 12) & 0xf];
    3396           0 :             *p++ = hexdigit[(ch >> 8) & 0xf];
    3397           0 :             *p++ = hexdigit[(ch >> 4) & 0xf];
    3398           0 :             *p++ = hexdigit[ch & 15];
    3399             :         }
    3400             :         /* Copy everything else as-is */
    3401             :         else
    3402           0 :             *p++ = (char) ch;
    3403             :     }
    3404           0 :     *p = '\0';
    3405           0 :     if (_PyString_Resize(&repr, p - q))
    3406           0 :         return NULL;
    3407           0 :     return repr;
    3408             : }
    3409             : 
    3410           0 : PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
    3411             : {
    3412           0 :     if (!PyUnicode_Check(unicode)) {
    3413           0 :         PyErr_BadArgument();
    3414           0 :         return NULL;
    3415             :     }
    3416           0 :     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
    3417             :                                             PyUnicode_GET_SIZE(unicode));
    3418             : }
    3419             : 
    3420             : /* --- Unicode Internal Codec ------------------------------------------- */
    3421             : 
    3422           0 : PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
    3423             :                                            Py_ssize_t size,
    3424             :                                            const char *errors)
    3425             : {
    3426           0 :     const char *starts = s;
    3427             :     Py_ssize_t startinpos;
    3428             :     Py_ssize_t endinpos;
    3429             :     Py_ssize_t outpos;
    3430             :     PyUnicodeObject *v;
    3431             :     Py_UNICODE *p;
    3432             :     const char *end;
    3433             :     const char *reason;
    3434           0 :     PyObject *errorHandler = NULL;
    3435           0 :     PyObject *exc = NULL;
    3436             : 
    3437             : #ifdef Py_UNICODE_WIDE
    3438             :     Py_UNICODE unimax = PyUnicode_GetMax();
    3439             : #endif
    3440             : 
    3441             :     /* XXX overflow detection missing */
    3442           0 :     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
    3443           0 :     if (v == NULL)
    3444           0 :         goto onError;
    3445           0 :     if (PyUnicode_GetSize((PyObject *)v) == 0)
    3446           0 :         return (PyObject *)v;
    3447           0 :     p = PyUnicode_AS_UNICODE(v);
    3448           0 :     end = s + size;
    3449             : 
    3450           0 :     while (s < end) {
    3451           0 :         if (end-s < Py_UNICODE_SIZE) {
    3452           0 :             endinpos = end-starts;
    3453           0 :             reason = "truncated input";
    3454           0 :             goto error;
    3455             :         }
    3456           0 :         memcpy(p, s, sizeof(Py_UNICODE));
    3457             : #ifdef Py_UNICODE_WIDE
    3458             :         /* We have to sanity check the raw data, otherwise doom looms for
    3459             :            some malformed UCS-4 data. */
    3460             :         if (*p > unimax || *p < 0) {
    3461             :             endinpos = s - starts + Py_UNICODE_SIZE;
    3462             :             reason = "illegal code point (> 0x10FFFF)";
    3463             :             goto error;
    3464             :         }
    3465             : #endif
    3466           0 :         p++;
    3467           0 :         s += Py_UNICODE_SIZE;
    3468           0 :         continue;
    3469             : 
    3470             :   error:
    3471           0 :         startinpos = s - starts;
    3472           0 :         outpos = p - PyUnicode_AS_UNICODE(v);
    3473           0 :         if (unicode_decode_call_errorhandler(
    3474             :                 errors, &errorHandler,
    3475             :                 "unicode_internal", reason,
    3476             :                 starts, size, &startinpos, &endinpos, &exc, &s,
    3477             :                 &v, &outpos, &p)) {
    3478           0 :             goto onError;
    3479             :         }
    3480             :     }
    3481             : 
    3482           0 :     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
    3483           0 :         goto onError;
    3484           0 :     Py_XDECREF(errorHandler);
    3485           0 :     Py_XDECREF(exc);
    3486           0 :     return (PyObject *)v;
    3487             : 
    3488             :   onError:
    3489           0 :     Py_XDECREF(v);
    3490           0 :     Py_XDECREF(errorHandler);
    3491           0 :     Py_XDECREF(exc);
    3492           0 :     return NULL;
    3493             : }
    3494             : 
    3495             : /* --- Latin-1 Codec ------------------------------------------------------ */
    3496             : 
    3497           0 : PyObject *PyUnicode_DecodeLatin1(const char *s,
    3498             :                                  Py_ssize_t size,
    3499             :                                  const char *errors)
    3500             : {
    3501             :     PyUnicodeObject *v;
    3502             :     Py_UNICODE *p;
    3503             : 
    3504             :     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
    3505           0 :     if (size == 1) {
    3506           0 :         Py_UNICODE r = *(unsigned char*)s;
    3507           0 :         return PyUnicode_FromUnicode(&r, 1);
    3508             :     }
    3509             : 
    3510           0 :     v = _PyUnicode_New(size);
    3511           0 :     if (v == NULL)
    3512           0 :         goto onError;
    3513           0 :     if (size == 0)
    3514           0 :         return (PyObject *)v;
    3515           0 :     p = PyUnicode_AS_UNICODE(v);
    3516           0 :     while (size-- > 0)
    3517           0 :         *p++ = (unsigned char)*s++;
    3518           0 :     return (PyObject *)v;
    3519             : 
    3520             :   onError:
    3521           0 :     Py_XDECREF(v);
    3522           0 :     return NULL;
    3523             : }
    3524             : 
    3525             : /* create or adjust a UnicodeEncodeError */
    3526           0 : static void make_encode_exception(PyObject **exceptionObject,
    3527             :                                   const char *encoding,
    3528             :                                   const Py_UNICODE *unicode, Py_ssize_t size,
    3529             :                                   Py_ssize_t startpos, Py_ssize_t endpos,
    3530             :                                   const char *reason)
    3531             : {
    3532           0 :     if (*exceptionObject == NULL) {
    3533           0 :         *exceptionObject = PyUnicodeEncodeError_Create(
    3534             :             encoding, unicode, size, startpos, endpos, reason);
    3535             :     }
    3536             :     else {
    3537           0 :         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
    3538           0 :             goto onError;
    3539           0 :         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
    3540           0 :             goto onError;
    3541           0 :         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
    3542           0 :             goto onError;
    3543           0 :         return;
    3544             :       onError:
    3545           0 :         Py_CLEAR(*exceptionObject);
    3546             :     }
    3547             : }
    3548             : 
    3549             : /* raises a UnicodeEncodeError */
    3550           0 : static void raise_encode_exception(PyObject **exceptionObject,
    3551             :                                    const char *encoding,
    3552             :                                    const Py_UNICODE *unicode, Py_ssize_t size,
    3553             :                                    Py_ssize_t startpos, Py_ssize_t endpos,
    3554             :                                    const char *reason)
    3555             : {
    3556           0 :     make_encode_exception(exceptionObject,
    3557             :                           encoding, unicode, size, startpos, endpos, reason);
    3558           0 :     if (*exceptionObject != NULL)
    3559           0 :         PyCodec_StrictErrors(*exceptionObject);
    3560           0 : }
    3561             : 
    3562             : /* Error-handling callback helper for the encoders:
    3563             :    looks up the handler named by `errors` (cached in *errorHandler), builds the
    3564             :    exception object, calls the handler and validates its (unicode, int) result;
    3565             :    stores the resume position in *newpos and returns the replacement string (new reference, the caller must DECREF it), or NULL on error */
    3566           0 : static PyObject *unicode_encode_call_errorhandler(const char *errors,
    3567             :                                                   PyObject **errorHandler,
    3568             :                                                   const char *encoding, const char *reason,
    3569             :                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
    3570             :                                                   Py_ssize_t startpos, Py_ssize_t endpos,
    3571             :                                                   Py_ssize_t *newpos)
    3572             : {
    3573             :     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";  /* the text after "O!n;" is reused below as the TypeError message */
    3574             : 
    3575             :     PyObject *restuple;
    3576             :     PyObject *resunicode;
    3577             : 
    3578           0 :     if (*errorHandler == NULL) {  /* first error: look the handler up once and cache it for the caller */
    3579           0 :         *errorHandler = PyCodec_LookupError(errors);
    3580           0 :         if (*errorHandler == NULL)
    3581           0 :             return NULL;
    3582             :     }
    3583             : 
    3584           0 :     make_encode_exception(exceptionObject,
    3585             :                           encoding, unicode, size, startpos, endpos, reason);
    3586           0 :     if (*exceptionObject == NULL)
    3587           0 :         return NULL;
    3588             : 
    3589           0 :     restuple = PyObject_CallFunctionObjArgs(
    3590             :         *errorHandler, *exceptionObject, NULL);
    3591           0 :     if (restuple == NULL)
    3592           0 :         return NULL;
    3593           0 :     if (!PyTuple_Check(restuple)) {
    3594           0 :         PyErr_SetString(PyExc_TypeError, &argparse[4]);  /* skip the 4-character "O!n;" format prefix */
    3595           0 :         Py_DECREF(restuple);
    3596           0 :         return NULL;
    3597             :     }
    3598           0 :     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
    3599             :                           &resunicode, newpos)) {
    3600           0 :         Py_DECREF(restuple);
    3601           0 :         return NULL;
    3602             :     }
    3603           0 :     if (*newpos<0)  /* a negative position counts back from the end of the input */
    3604           0 :         *newpos = size+*newpos;
    3605           0 :     if (*newpos<0 || *newpos>size) {
    3606           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    3607           0 :         Py_DECREF(restuple);
    3608           0 :         return NULL;
    3609             :     }
    3610           0 :     Py_INCREF(resunicode);  /* keep the replacement alive after the result tuple is released */
    3611           0 :     Py_DECREF(restuple);
    3612           0 :     return resunicode;
    3613             : }
    3614             : 
    3615           0 : static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,  /* shared encoder: limit == 256 selects "latin-1", any other value "ascii"; returns a new string object or NULL on error */
    3616             :                                      Py_ssize_t size,
    3617             :                                      const char *errors,
    3618             :                                      int limit)
    3619             : {
    3620             :     /* output object */
    3621             :     PyObject *res;
    3622             :     /* pointers to the beginning and end+1 of input */
    3623           0 :     const Py_UNICODE *startp = p;
    3624           0 :     const Py_UNICODE *endp = p + size;
    3625             :     /* pointer to the beginning of the unencodable characters */
    3626             :     /* const Py_UNICODE *badp = NULL; */
    3627             :     /* pointer into the output */
    3628             :     char *str;
    3629             :     /* current output position */
    3630           0 :     Py_ssize_t respos = 0;
    3631             :     Py_ssize_t ressize;
    3632           0 :     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    3633           0 :     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    3634           0 :     PyObject *errorHandler = NULL;
    3635           0 :     PyObject *exc = NULL;
    3636             :     /* the following variable is used for caching string comparisons
    3637             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    3638           0 :     int known_errorHandler = -1;
    3639             : 
    3640             :     /* allocate enough for a simple encoding without
    3641             :        replacements, if we need more, we'll resize */
    3642           0 :     res = PyString_FromStringAndSize(NULL, size);
    3643           0 :     if (res == NULL)
    3644           0 :         goto onError;
    3645           0 :     if (size == 0)  /* empty input: return the empty result right away */
    3646           0 :         return res;
    3647           0 :     str = PyString_AS_STRING(res);
    3648           0 :     ressize = size;
    3649             : 
    3650           0 :     while (p<endp) {
    3651           0 :         Py_UNICODE c = *p;
    3652             : 
    3653             :         /* can we encode this? */
    3654           0 :         if (c<limit) {
    3655             :             /* no overflow check, because we know that the space is enough */
    3656           0 :             *str++ = (char)c;
    3657           0 :             ++p;
    3658             :         }
    3659             :         else {
    3660           0 :             Py_ssize_t unicodepos = p-startp;
    3661             :             Py_ssize_t requiredsize;
    3662             :             PyObject *repunicode;
    3663             :             Py_ssize_t repsize;
    3664             :             Py_ssize_t newpos;
    3665             :             Py_ssize_t respos;  /* NOTE(review): shadows the function-level respos declared above */
    3666             :             Py_UNICODE *uni2;
    3667             :             /* startpos for collecting unencodable chars */
    3668           0 :             const Py_UNICODE *collstart = p;
    3669           0 :             const Py_UNICODE *collend = p;
    3670             :             /* find all unencodable characters */
    3671           0 :             while ((collend < endp) && ((*collend) >= limit))
    3672           0 :                 ++collend;
    3673             :             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
    3674           0 :             if (known_errorHandler==-1) {
    3675           0 :                 if ((errors==NULL) || (!strcmp(errors, "strict")))
    3676           0 :                     known_errorHandler = 1;
    3677           0 :                 else if (!strcmp(errors, "replace"))
    3678           0 :                     known_errorHandler = 2;
    3679           0 :                 else if (!strcmp(errors, "ignore"))
    3680           0 :                     known_errorHandler = 3;
    3681           0 :                 else if (!strcmp(errors, "xmlcharrefreplace"))
    3682           0 :                     known_errorHandler = 4;
    3683             :                 else
    3684           0 :                     known_errorHandler = 0;
    3685             :             }
    3686           0 :             switch (known_errorHandler) {
    3687             :             case 1: /* strict */
    3688           0 :                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
    3689           0 :                 goto onError;
    3690             :             case 2: /* replace */
    3691           0 :                 while (collstart++ < collend)  /* one '?' per unencodable code unit; fits because the buffer holds at least `size` bytes */
    3692           0 :                     *str++ = '?'; /* fall through */
    3693             :             case 3: /* ignore */
    3694           0 :                 p = collend;
    3695           0 :                 break;
    3696             :             case 4: /* xmlcharrefreplace */
    3697           0 :                 respos = str - PyString_AS_STRING(res);
    3698             :                 /* determine replacement size (temporarily (mis)uses p) */
    3699           0 :                 requiredsize = respos;
    3700           0 :                 for (p = collstart; p < collend;) {
    3701           0 :                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
    3702             :                     Py_ssize_t incr;  /* length of "&#" + decimal digits + ";" for this character */
    3703           0 :                     if (ch < 10)
    3704           0 :                         incr = 2+1+1;
    3705           0 :                     else if (ch < 100)
    3706           0 :                         incr = 2+2+1;
    3707           0 :                     else if (ch < 1000)
    3708           0 :                         incr = 2+3+1;
    3709           0 :                     else if (ch < 10000)
    3710           0 :                         incr = 2+4+1;
    3711           0 :                     else if (ch < 100000)
    3712           0 :                         incr = 2+5+1;
    3713           0 :                     else if (ch < 1000000)
    3714           0 :                         incr = 2+6+1;
    3715             :                     else
    3716           0 :                         incr = 2+7+1;
    3717           0 :                     if (requiredsize > PY_SSIZE_T_MAX - incr)
    3718           0 :                         goto overflow;
    3719           0 :                     requiredsize += incr;
    3720             :                 }
    3721           0 :                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
    3722           0 :                     goto overflow;
    3723           0 :                 requiredsize += endp - collend;  /* also reserve room for the rest of the input */
    3724           0 :                 if (requiredsize > ressize) {
    3725           0 :                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)  /* grow at least geometrically when that cannot overflow */
    3726           0 :                         requiredsize = 2*ressize;
    3727           0 :                     if (_PyString_Resize(&res, requiredsize))
    3728           0 :                         goto onError;
    3729           0 :                     str = PyString_AS_STRING(res) + respos;  /* the resize may have moved the buffer */
    3730           0 :                     ressize = requiredsize;
    3731             :                 }
    3732             :                 /* generate replacement (temporarily (mis)uses p) */
    3733           0 :                 for (p = collstart; p < collend;) {
    3734           0 :                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
    3735           0 :                     str += sprintf(str, "&#%d;", (int)ch);  /* unbounded sprintf: relies on the size computed by the loop above */
    3736             :                 }
    3737           0 :                 p = collend;
    3738           0 :                 break;
    3739             :             default:
    3740           0 :                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
    3741             :                                                               encoding, reason, startp, size, &exc,
    3742           0 :                                                               collstart-startp, collend-startp, &newpos);
    3743           0 :                 if (repunicode == NULL)
    3744           0 :                     goto onError;
    3745             :                 /* need more space? (at least enough for what we have+the
    3746             :                    replacement+the rest of the string, so we won't have to
    3747             :                    check space for encodable characters) */
    3748           0 :                 respos = str - PyString_AS_STRING(res);
    3749           0 :                 repsize = PyUnicode_GET_SIZE(repunicode);
    3750           0 :                 if (respos > PY_SSIZE_T_MAX - repsize)
    3751           0 :                     goto overflow;  /* NOTE(review): repunicode is not released on this path - looks like a reference leak, confirm */
    3752           0 :                 requiredsize = respos + repsize;
    3753           0 :                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
    3754           0 :                     goto overflow;  /* NOTE(review): same as above, repunicode is still owned here */
    3755           0 :                 requiredsize += endp - collend;
    3756           0 :                 if (requiredsize > ressize) {
    3757           0 :                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
    3758           0 :                         requiredsize = 2*ressize;
    3759           0 :                     if (_PyString_Resize(&res, requiredsize)) {
    3760           0 :                         Py_DECREF(repunicode);
    3761           0 :                         goto onError;
    3762             :                     }
    3763           0 :                     str = PyString_AS_STRING(res) + respos;
    3764           0 :                     ressize = requiredsize;
    3765             :                 }
    3766             :                 /* check if there is anything unencodable in the replacement
    3767             :                    and copy it to the output */
    3768           0 :                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
    3769           0 :                     c = *uni2;
    3770           0 :                     if (c >= limit) {
    3771           0 :                         raise_encode_exception(&exc, encoding, startp, size,
    3772             :                                                unicodepos, unicodepos+1, reason);  /* the error is reported at the original input position, not inside the replacement */
    3773           0 :                         Py_DECREF(repunicode);
    3774           0 :                         goto onError;
    3775             :                     }
    3776           0 :                     *str = (char)c;
    3777             :                 }
    3778           0 :                 p = startp + newpos;  /* resume where the error handler asked us to */
    3779           0 :                 Py_DECREF(repunicode);
    3780             :             }
    3781             :         }
    3782             :     }
    3783             :     /* Resize if we allocated too much */
    3784           0 :     respos = str - PyString_AS_STRING(res);
    3785           0 :     if (respos < ressize)
    3786             :         /* If this fails res will be NULL */
    3787           0 :         _PyString_Resize(&res, respos);
    3788           0 :     Py_XDECREF(errorHandler);
    3789           0 :     Py_XDECREF(exc);
    3790           0 :     return res;
    3791             : 
    3792             :   overflow:
    3793           0 :     PyErr_SetString(PyExc_OverflowError,
    3794             :                     "encoded result is too long for a Python string");
    3795             : 
    3796             :   onError:  /* shared cleanup; the overflow label above falls through into it */
    3797           0 :     Py_XDECREF(res);
    3798           0 :     Py_XDECREF(errorHandler);
    3799           0 :     Py_XDECREF(exc);
    3800           0 :     return NULL;
    3801             : }
    3802             : 
    3803           0 : PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,  /* encodes `size` code units starting at p as Latin-1 */
    3804             :                                  Py_ssize_t size,
    3805             :                                  const char *errors)
    3806             : {
    3807           0 :     return unicode_encode_ucs1(p, size, errors, 256);  /* limit 256 selects the "latin-1" variant of the shared encoder */
    3808             : }
    3809             : 
    3810           0 : PyObject *PyUnicode_AsLatin1String(PyObject *unicode)  /* Latin-1 encodes a unicode object; returns NULL if the argument is not a unicode object */
    3811             : {
    3812           0 :     if (!PyUnicode_Check(unicode)) {
    3813           0 :         PyErr_BadArgument();
    3814           0 :         return NULL;
    3815             :     }
    3816           0 :     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
    3817             :                                   PyUnicode_GET_SIZE(unicode),
    3818             :                                   NULL);  /* errors == NULL is treated as "strict" by unicode_encode_ucs1() */
    3819             : }
    3820             : 
    3821             : /* --- 7-bit ASCII Codec -------------------------------------------------- */
    3822             : 
    3823           0 : PyObject *PyUnicode_DecodeASCII(const char *s,
    3824             :                                 Py_ssize_t size,
    3825             :                                 const char *errors)
    3826             : {
    3827           0 :     const char *starts = s;
    3828             :     PyUnicodeObject *v;
    3829             :     Py_UNICODE *p;
    3830             :     Py_ssize_t startinpos;
    3831             :     Py_ssize_t endinpos;
    3832             :     Py_ssize_t outpos;
    3833             :     const char *e;
    3834           0 :     PyObject *errorHandler = NULL;
    3835           0 :     PyObject *exc = NULL;
    3836             : 
    3837             :     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    3838           0 :     if (size == 1 && *(unsigned char*)s < 128) {
    3839           0 :         Py_UNICODE r = *(unsigned char*)s;
    3840           0 :         return PyUnicode_FromUnicode(&r, 1);
    3841             :     }
    3842             : 
    3843           0 :     v = _PyUnicode_New(size);
    3844           0 :     if (v == NULL)
    3845           0 :         goto onError;
    3846           0 :     if (size == 0)
    3847           0 :         return (PyObject *)v;
    3848           0 :     p = PyUnicode_AS_UNICODE(v);
    3849           0 :     e = s + size;
    3850           0 :     while (s < e) {
    3851           0 :         register unsigned char c = (unsigned char)*s;
    3852           0 :         if (c < 128) {
    3853           0 :             *p++ = c;
    3854           0 :             ++s;
    3855             :         }
    3856             :         else {
    3857           0 :             startinpos = s-starts;
    3858           0 :             endinpos = startinpos + 1;
    3859           0 :             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
    3860           0 :             if (unicode_decode_call_errorhandler(
    3861             :                     errors, &errorHandler,
    3862             :                     "ascii", "ordinal not in range(128)",
    3863             :                     starts, size, &startinpos, &endinpos, &exc, &s,
    3864             :                     &v, &outpos, &p))
    3865           0 :                 goto onError;
    3866             :         }
    3867             :     }
    3868           0 :     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
    3869           0 :         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
    3870           0 :             goto onError;
    3871           0 :     Py_XDECREF(errorHandler);
    3872           0 :     Py_XDECREF(exc);
    3873           0 :     return (PyObject *)v;
    3874             : 
    3875             :   onError:
    3876           0 :     Py_XDECREF(v);
    3877           0 :     Py_XDECREF(errorHandler);
    3878           0 :     Py_XDECREF(exc);
    3879           0 :     return NULL;
    3880             : }
    3881             : 
    3882           0 : PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,  /* encodes `size` code units starting at p as ASCII */
    3883             :                                 Py_ssize_t size,
    3884             :                                 const char *errors)
    3885             : {
    3886           0 :     return unicode_encode_ucs1(p, size, errors, 128);  /* limit 128 selects the "ascii" variant of the shared encoder */
    3887             : }
    3888             : 
    3889           0 : PyObject *PyUnicode_AsASCIIString(PyObject *unicode)  /* ASCII encodes a unicode object; returns NULL if the argument is not a unicode object */
    3890             : {
    3891           0 :     if (!PyUnicode_Check(unicode)) {
    3892           0 :         PyErr_BadArgument();
    3893           0 :         return NULL;
    3894             :     }
    3895           0 :     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
    3896             :                                  PyUnicode_GET_SIZE(unicode),
    3897             :                                  NULL);  /* errors == NULL is treated as "strict" by unicode_encode_ucs1() */
    3898             : }
    3899             : 
    3900             : #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    3901             : 
    3902             : /* --- MBCS codecs for Windows -------------------------------------------- */
    3903             : 
    3904             : #if SIZEOF_INT < SIZEOF_SIZE_T
    3905             : #define NEED_RETRY
    3906             : #endif
    3907             : 
    3908             : /* XXX This code is limited to "true" double-byte encodings, as
    3909             :    a) it assumes an incomplete character consists of a single byte, and
    3910             :    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
    3911             :    encodings, see IsDBCSLeadByteEx documentation. */
    3912             : 
    3913             : static int is_dbcs_lead_byte(const char *s, int offset)  /* returns non-zero if the byte at s[offset] is judged to be a DBCS lead byte, else 0 */
    3914             : {
    3915             :     const char *curr = s + offset;
    3916             : 
    3917             :     if (IsDBCSLeadByte(*curr)) {
    3918             :         const char *prev = CharPrev(s, curr);  /* step back one character to see how the preceding bytes pair up */
    3919             :         return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    3920             :     }
    3921             :     return 0;
    3922             : }
    3923             : 
    3924             : /*
    3925             :  * Decode MBCS string into unicode object. If 'final' is set, converts
    3926             :  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
    3927             :  * If *v is NULL a new object is created; otherwise the decoded text is appended to *v.
    3928             : static int decode_mbcs(PyUnicodeObject **v,
    3929             :                        const char *s, /* MBCS string */
    3930             :                        int size, /* sizeof MBCS string */
    3931             :                        int final)
    3932             : {
    3933             :     Py_UNICODE *p;
    3934             :     Py_ssize_t n = 0;  /* length already present in *v; new characters are written after it */
    3935             :     int usize = 0;
    3936             : 
    3937             :     assert(size >= 0);
    3938             : 
    3939             :     /* Skip trailing lead-byte unless 'final' is set */
    3940             :     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
    3941             :         --size;
    3942             : 
    3943             :     /* First get the size of the result */
    3944             :     if (size > 0) {
    3945             :         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
    3946             :         if (usize == 0) {
    3947             :             PyErr_SetFromWindowsErrWithFilename(0, NULL);
    3948             :             return -1;
    3949             :         }
    3950             :     }
    3951             : 
    3952             :     if (*v == NULL) {
    3953             :         /* Create unicode object */
    3954             :         *v = _PyUnicode_New(usize);
    3955             :         if (*v == NULL)
    3956             :             return -1;
    3957             :     }
    3958             :     else {
    3959             :         /* Extend unicode object */
    3960             :         n = PyUnicode_GET_SIZE(*v);
    3961             :         if (_PyUnicode_Resize(v, n + usize) < 0)
    3962             :             return -1;
    3963             :     }
    3964             : 
    3965             :     /* Do the conversion */
    3966             :     if (size > 0) {
    3967             :         p = PyUnicode_AS_UNICODE(*v) + n;
    3968             :         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
    3969             :             PyErr_SetFromWindowsErrWithFilename(0, NULL);
    3970             :             return -1;
    3971             :         }
    3972             :     }
    3973             : 
    3974             :     return size;  /* bytes consumed; may be one less than requested when a trailing lead byte was skipped */
    3975             : }
    3976             : 
    3977             : PyObject *PyUnicode_DecodeMBCSStateful(const char *s,  /* decodes MBCS bytes; when `consumed` is non-NULL it receives the number of bytes actually decoded */
    3978             :                                        Py_ssize_t size,
    3979             :                                        const char *errors,  /* not used by this implementation */
    3980             :                                        Py_ssize_t *consumed)
    3981             : {
    3982             :     PyUnicodeObject *v = NULL;
    3983             :     int done;
    3984             : 
    3985             :     if (consumed)
    3986             :         *consumed = 0;
    3987             : 
    3988             : #ifdef NEED_RETRY
    3989             :   retry:
    3990             :     if (size > INT_MAX)  /* decode_mbcs() takes an int size, so larger inputs are processed in INT_MAX-sized pieces */
    3991             :         done = decode_mbcs(&v, s, INT_MAX, 0);
    3992             :     else
    3993             : #endif
    3994             :         done = decode_mbcs(&v, s, (int)size, !consumed);  /* final == 1 only when the caller does not ask for a consumed count */
    3995             : 
    3996             :     if (done < 0) {
    3997             :         Py_XDECREF(v);
    3998             :         return NULL;
    3999             :     }
    4000             : 
    4001             :     if (consumed)
    4002             :         *consumed += done;
    4003             : 
    4004             : #ifdef NEED_RETRY
    4005             :     if (size > INT_MAX) {
    4006             :         s += done;  /* advance by what was really consumed, which may be less than INT_MAX */
    4007             :         size -= done;
    4008             :         goto retry;
    4009             :     }
    4010             : #endif
    4011             : 
    4012             :     return (PyObject *)v;
    4013             : }
    4014             : 
    4015             : PyObject *PyUnicode_DecodeMBCS(const char *s,  /* non-stateful MBCS decode of `size` bytes at s */
    4016             :                                Py_ssize_t size,
    4017             :                                const char *errors)
    4018             : {
    4019             :     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);  /* consumed == NULL makes the stateful decoder treat the input as final */
    4020             : }
    4021             : 
    4022             : /*
    4023             :  * Convert unicode into string object (MBCS).
    4024             :  * Returns 0 if succeed, -1 otherwise.
    4025             :  * If *repr is NULL a new string is created; otherwise the encoded bytes are appended to *repr.
    4026             : static int encode_mbcs(PyObject **repr,
    4027             :                        const Py_UNICODE *p, /* unicode */
    4028             :                        int size) /* size of unicode */
    4029             : {
    4030             :     int mbcssize = 0;
    4031             :     Py_ssize_t n = 0;  /* length already present in *repr; new bytes are written after it */
    4032             : 
    4033             :     assert(size >= 0);
    4034             : 
    4035             :     /* First get the size of the result */
    4036             :     if (size > 0) {
    4037             :         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
    4038             :         if (mbcssize == 0) {
    4039             :             PyErr_SetFromWindowsErrWithFilename(0, NULL);
    4040             :             return -1;
    4041             :         }
    4042             :     }
    4043             : 
    4044             :     if (*repr == NULL) {
    4045             :         /* Create string object */
    4046             :         *repr = PyString_FromStringAndSize(NULL, mbcssize);
    4047             :         if (*repr == NULL)
    4048             :             return -1;
    4049             :     }
    4050             :     else {
    4051             :         /* Extend string object */
    4052             :         n = PyString_Size(*repr);
    4053             :         if (_PyString_Resize(repr, n + mbcssize) < 0)
    4054             :             return -1;
    4055             :     }
    4056             : 
    4057             :     /* Do the conversion */
    4058             :     if (size > 0) {
    4059             :         char *s = PyString_AS_STRING(*repr) + n;
    4060             :         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
    4061             :             PyErr_SetFromWindowsErrWithFilename(0, NULL);
    4062             :             return -1;
    4063             :         }
    4064             :     }
    4065             : 
    4066             :     return 0;
    4067             : }
    4068             : 
    4069             : PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,  /* encodes `size` code units at p to an MBCS string object; returns NULL on error */
    4070             :                                Py_ssize_t size,
    4071             :                                const char *errors)  /* not used by this implementation */
    4072             : {
    4073             :     PyObject *repr = NULL;
    4074             :     int ret;
    4075             : 
    4076             : #ifdef NEED_RETRY
    4077             :   retry:
    4078             :     if (size > INT_MAX)  /* encode_mbcs() takes an int size, so larger inputs are processed in INT_MAX-sized pieces */
    4079             :         ret = encode_mbcs(&repr, p, INT_MAX);
    4080             :     else
    4081             : #endif
    4082             :         ret = encode_mbcs(&repr, p, (int)size);
    4083             : 
    4084             :     if (ret < 0) {
    4085             :         Py_XDECREF(repr);
    4086             :         return NULL;
    4087             :     }
    4088             : 
    4089             : #ifdef NEED_RETRY
    4090             :     if (size > INT_MAX) {
    4091             :         p += INT_MAX;
    4092             :         size -= INT_MAX;
    4093             :         goto retry;
    4094             :     }
    4095             : #endif
    4096             : 
    4097             :     return repr;
    4098             : }
    4099             : 
    4100             : PyObject *PyUnicode_AsMBCSString(PyObject *unicode)  /* MBCS encodes a unicode object; returns NULL if the argument is not a unicode object */
    4101             : {
    4102             :     if (!PyUnicode_Check(unicode)) {
    4103             :         PyErr_BadArgument();
    4104             :         return NULL;
    4105             :     }
    4106             :     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
    4107             :                                 PyUnicode_GET_SIZE(unicode),
    4108             :                                 NULL);
    4109             : }
    4110             : 
    4111             : #undef NEED_RETRY
    4112             : 
    4113             : #endif /* MS_WINDOWS */
    4114             : 
    4115             : /* --- Character Mapping Codec -------------------------------------------- */
    4116             : /* Decode size bytes at s through mapping; returns the unicode result, or NULL on error. */
    4117           0 : PyObject *PyUnicode_DecodeCharmap(const char *s, /* bytes to decode */
    4118             :                                   Py_ssize_t size, /* number of bytes at s */
    4119             :                                   PyObject *mapping, /* unicode table or object indexed by byte value; NULL selects Latin-1 */
    4120             :                                   const char *errors) /* error handling scheme, forwarded to the error handler */
    4121             : {
    4122           0 :     const char *starts = s; /* start of input, for computing error positions */
    4123             :     Py_ssize_t startinpos;
    4124             :     Py_ssize_t endinpos;
    4125             :     Py_ssize_t outpos;
    4126             :     const char *e; /* one past the last input byte */
    4127             :     PyUnicodeObject *v; /* the result object being filled */
    4128             :     Py_UNICODE *p; /* write cursor into v's buffer */
    4129           0 :     Py_ssize_t extrachars = 0; /* spare output slots reserved by earlier resizes */
    4130           0 :     PyObject *errorHandler = NULL;
    4131           0 :     PyObject *exc = NULL;
    4132           0 :     Py_UNICODE *mapstring = NULL;
    4133           0 :     Py_ssize_t maplen = 0;
    4134             : 
    4135             :     /* Default to Latin-1 */
    4136           0 :     if (mapping == NULL)
    4137           0 :         return PyUnicode_DecodeLatin1(s, size, errors);
    4138             : 
    4139           0 :     v = _PyUnicode_New(size); /* initial allocation: one output slot per input byte */
    4140           0 :     if (v == NULL)
    4141           0 :         goto onError;
    4142           0 :     if (size == 0)
    4143           0 :         return (PyObject *)v; /* empty input: return the empty result as is */
    4144           0 :     p = PyUnicode_AS_UNICODE(v);
    4145           0 :     e = s + size;
    4146           0 :     if (PyUnicode_CheckExact(mapping)) { /* mapping is exactly a unicode string: index its buffer directly */
    4147           0 :         mapstring = PyUnicode_AS_UNICODE(mapping);
    4148           0 :         maplen = PyUnicode_GET_SIZE(mapping);
    4149           0 :         while (s < e) {
    4150           0 :             unsigned char ch = *s;
    4151           0 :             Py_UNICODE x = 0xfffe; /* illegal value */
    4152             : /* bytes at or beyond the end of the table keep the "undefined" marker */
    4153           0 :             if (ch < maplen)
    4154           0 :                 x = mapstring[ch];
    4155             : 
    4156           0 :             if (x == 0xfffe) {
    4157             :                 /* undefined mapping */
    4158           0 :                 outpos = p-PyUnicode_AS_UNICODE(v);
    4159           0 :                 startinpos = s-starts;
    4160           0 :                 endinpos = startinpos+1;
    4161           0 :                 if (unicode_decode_call_errorhandler(
    4162             :                         errors, &errorHandler,
    4163             :                         "charmap", "character maps to <undefined>",
    4164             :                         starts, size, &startinpos, &endinpos, &exc, &s,
    4165             :                         &v, &outpos, &p)) {
    4166           0 :                     goto onError;
    4167             :                 }
    4168           0 :                 continue; /* no ++s here: s is passed by address to the handler, which presumably repositions it */
    4169             :             }
    4170           0 :             *p++ = x;
    4171           0 :             ++s;
    4172             :         }
    4173             :     }
    4174             :     else { /* any other mapping object: look each byte value up with PyObject_GetItem */
    4175           0 :         while (s < e) {
    4176           0 :             unsigned char ch = *s;
    4177             :             PyObject *w, *x;
    4178             : 
    4179             :             /* Get mapping (char ordinal -> integer, Unicode char or None) */
    4180           0 :             w = PyInt_FromLong((long)ch);
    4181           0 :             if (w == NULL)
    4182           0 :                 goto onError;
    4183           0 :             x = PyObject_GetItem(mapping, w);
    4184           0 :             Py_DECREF(w);
    4185           0 :             if (x == NULL) {
    4186           0 :                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    4187             :                     /* No mapping found means: mapping is undefined. */
    4188           0 :                     PyErr_Clear();
    4189           0 :                     goto Undefined;
    4190             :                 } else
    4191           0 :                     goto onError; /* any other exception is propagated to the caller */
    4192             :             }
    4193             : 
    4194             :             /* Apply mapping */
    4195           0 :             if (x == Py_None)
    4196           0 :                 goto Undefined;
    4197           0 :             if (PyInt_Check(x)) {
    4198           0 :                 long value = PyInt_AS_LONG(x);
    4199           0 :                 if (value == 0xFFFE) /* 0xFFFE is treated as "undefined" on this path as well */
    4200           0 :                     goto Undefined;
    4201           0 :                 if (value < 0 || value > 0x10FFFF) {
    4202           0 :                     PyErr_SetString(PyExc_TypeError,
    4203             :                                     "character mapping must be in range(0x110000)");
    4204           0 :                     Py_DECREF(x);
    4205           0 :                     goto onError;
    4206             :                 }
    4207             : 
    4208             : #ifndef Py_UNICODE_WIDE
    4209           0 :                 if (value > 0xFFFF) { /* needs two output units when Py_UNICODE_WIDE is not defined */
    4210             :                     /* see the code for 1-n mapping below */
    4211           0 :                     if (extrachars < 2) {
    4212             :                         /* resize first */
    4213           0 :                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
    4214           0 :                         Py_ssize_t needed = 10 - extrachars; /* tops the reserve up to 10 spare slots */
    4215           0 :                         extrachars += needed;
    4216             :                         /* XXX overflow detection missing */
    4217           0 :                         if (_PyUnicode_Resize(&v,
    4218           0 :                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
    4219           0 :                             Py_DECREF(x);
    4220           0 :                             goto onError;
    4221             :                         }
    4222           0 :                         p = PyUnicode_AS_UNICODE(v) + oldpos; /* re-derive the cursor from v after the resize */
    4223             :                     }
    4224           0 :                     value -= 0x10000; /* write the value as a high/low surrogate pair */
    4225           0 :                     *p++ = 0xD800 | (value >> 10);
    4226           0 :                     *p++ = 0xDC00 | (value & 0x3FF);
    4227           0 :                     extrachars -= 2;
    4228             :                 }
    4229             :                 else
    4230             : #endif
    4231           0 :                 *p++ = (Py_UNICODE)value;
    4232             :             }
    4233           0 :             else if (PyUnicode_Check(x)) {
    4234           0 :                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
    4235             : 
    4236           0 :                 if (targetsize == 1) {
    4237             :                     /* 1-1 mapping */
    4238           0 :                     Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
    4239           0 :                     if (value == 0xFFFE)
    4240           0 :                         goto Undefined;
    4241           0 :                     *p++ = value;
    4242             :                 }
    4243           0 :                 else if (targetsize > 1) {
    4244             :                     /* 1-n mapping */
    4245           0 :                     if (targetsize > extrachars) {
    4246             :                         /* resize first: grow by the shortfall plus 4*targetsize of headroom */
    4247           0 :                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
    4248           0 :                         Py_ssize_t needed = (targetsize - extrachars) + \
    4249           0 :                             (targetsize << 2);
    4250           0 :                         extrachars += needed;
    4251             :                         /* XXX overflow detection missing */
    4252           0 :                         if (_PyUnicode_Resize(&v,
    4253           0 :                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
    4254           0 :                             Py_DECREF(x);
    4255           0 :                             goto onError;
    4256             :                         }
    4257           0 :                         p = PyUnicode_AS_UNICODE(v) + oldpos; /* re-derive the cursor from v after the resize */
    4258             :                     }
    4259           0 :                     Py_UNICODE_COPY(p,
    4260             :                                     PyUnicode_AS_UNICODE(x),
    4261             :                                     targetsize);
    4262           0 :                     p += targetsize;
    4263           0 :                     extrachars -= targetsize;
    4264             :                 }
    4265             :                 /* 1-0 mapping: skip the character */
    4266             :             }
    4267             :             else {
    4268             :                 /* wrong return value */
    4269           0 :                 PyErr_SetString(PyExc_TypeError,
    4270             :                                 "character mapping must return integer, None or unicode");
    4271           0 :                 Py_DECREF(x);
    4272           0 :                 goto onError;
    4273             :             }
    4274           0 :             Py_DECREF(x);
    4275           0 :             ++s;
    4276           0 :             continue;
    4277             : Undefined:
    4278             :             /* undefined mapping: report the single byte at s to the error handler */
    4279           0 :             Py_XDECREF(x); /* x is NULL when we arrive from the failed lookup above */
    4280           0 :             outpos = p-PyUnicode_AS_UNICODE(v);
    4281           0 :             startinpos = s-starts;
    4282           0 :             endinpos = startinpos+1;
    4283           0 :             if (unicode_decode_call_errorhandler(
    4284             :                     errors, &errorHandler,
    4285             :                     "charmap", "character maps to <undefined>",
    4286             :                     starts, size, &startinpos, &endinpos, &exc, &s,
    4287             :                     &v, &outpos, &p)) {
    4288           0 :                 goto onError;
    4289             :             }
    4290             :         }
    4291             :     }
    4292           0 :     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) /* trim the unused tail of the buffer */
    4293           0 :         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
    4294           0 :             goto onError;
    4295           0 :     Py_XDECREF(errorHandler);
    4296           0 :     Py_XDECREF(exc);
    4297           0 :     return (PyObject *)v;
    4298             : 
    4299             :   onError: /* common failure exit: release the handler, the exception object and the partial result */
    4300           0 :     Py_XDECREF(errorHandler);
    4301           0 :     Py_XDECREF(exc);
    4302           0 :     Py_XDECREF(v);
    4303           0 :     return NULL;
    4304             : }
    4305             : 
    4306             : /* Charmap encoding: the lookup table (a three-level trie stored inline in the object) */
    4307             : 
    4308             : struct encoding_map{
    4309             :     PyObject_HEAD
    4310             :     unsigned char level1[32]; /* indexed by c>>11; holds a level-2 block number, 0xFF if absent */
    4311             :     int count2, count3; /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    4312             :     unsigned char level23[1]; /* variable-length tail: the level-2 blocks followed by the level-3 blocks */
    4313             : };
    4314             : /* "size" method: byte size of the object including its variable-length trie tail; args is not used. */
    4315             : static PyObject*
    4316           0 : encoding_map_size(PyObject *obj, PyObject* args)
    4317             : {
    4318           0 :     struct encoding_map *map = (struct encoding_map*)obj;
    4319           0 :     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + /* -1: level23[1] is already counted in sizeof */
    4320           0 :                           128*map->count3);
    4321             : }
    4322             : /* Method table referenced by EncodingMapType's tp_methods slot. */
    4323             : static PyMethodDef encoding_map_methods[] = {
    4324             :     {"size", encoding_map_size, METH_NOARGS,
    4325             :      PyDoc_STR("Return the size (in bytes) of this object") },
    4326             :     { 0 } /* all-zero final entry (presumably the table terminator) */
    4327             : };
    4328             : /* tp_dealloc slot of EncodingMapType: release the object's memory with PyObject_FREE. */
    4329             : static void
    4330           0 : encoding_map_dealloc(PyObject* o)
    4331             : {
    4332           0 :     PyObject_FREE(o); /* counterpart of the PyObject_MALLOC in PyUnicode_BuildEncodingMap */
    4333           0 : }
    4334             : /* Type object for the trie-based encoding maps created by PyUnicode_BuildEncodingMap. */
    4335             : static PyTypeObject EncodingMapType = {
    4336             :     PyVarObject_HEAD_INIT(NULL, 0)
    4337             :     "EncodingMap",          /*tp_name*/
    4338             :     sizeof(struct encoding_map),   /*tp_basicsize*/
    4339             :     0,                      /*tp_itemsize*/
    4340             :     /* methods */
    4341             :     encoding_map_dealloc,   /*tp_dealloc*/
    4342             :     0,                      /*tp_print*/
    4343             :     0,                      /*tp_getattr*/
    4344             :     0,                      /*tp_setattr*/
    4345             :     0,                      /*tp_compare*/
    4346             :     0,                      /*tp_repr*/
    4347             :     0,                      /*tp_as_number*/
    4348             :     0,                      /*tp_as_sequence*/
    4349             :     0,                      /*tp_as_mapping*/
    4350             :     0,                      /*tp_hash*/
    4351             :     0,                      /*tp_call*/
    4352             :     0,                      /*tp_str*/
    4353             :     0,                      /*tp_getattro*/
    4354             :     0,                      /*tp_setattro*/
    4355             :     0,                      /*tp_as_buffer*/
    4356             :     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    4357             :     0,                      /*tp_doc*/
    4358             :     0,                      /*tp_traverse*/
    4359             :     0,                      /*tp_clear*/
    4360             :     0,                      /*tp_richcompare*/
    4361             :     0,                      /*tp_weaklistoffset*/
    4362             :     0,                      /*tp_iter*/
    4363             :     0,                      /*tp_iternext*/
    4364             :     encoding_map_methods,   /*tp_methods: exposes the "size" method*/
    4365             :     0,                      /*tp_members*/
    4366             :     0,                      /*tp_getset*/
    4367             :     0,                      /*tp_base*/
    4368             :     0,                      /*tp_dict*/
    4369             :     0,                      /*tp_descr_get*/
    4370             :     0,                      /*tp_descr_set*/
    4371             :     0,                      /*tp_dictoffset*/
    4372             :     0,                      /*tp_init*/
    4373             :     0,                      /*tp_alloc*/
    4374             :     0,                      /*tp_new: left empty; instances are allocated by hand in PyUnicode_BuildEncodingMap*/
    4375             :     0,                      /*tp_free*/
    4376             :     0,                      /*tp_is_gc*/
    4377             : };
    4378             : /* Build an encoding map from a 256-character decoding table: an EncodingMap trie when possible, otherwise a dict of code point -> byte value. Returns NULL on error. */
    4379             : PyObject*
    4380           0 : PyUnicode_BuildEncodingMap(PyObject* string)
    4381             : {
    4382             :     Py_UNICODE *decode; /* decode[i] is the character that byte i decodes to */
    4383             :     PyObject *result;
    4384             :     struct encoding_map *mresult;
    4385             :     int i;
    4386           0 :     int need_dict = 0;
    4387             :     unsigned char level1[32]; /* first pass: block number assigned to each c>>11 prefix in use, else 0xFF */
    4388             :     unsigned char level2[512]; /* first pass: marks each c>>7 prefix in use (only used for counting) */
    4389             :     unsigned char *mlevel1, *mlevel2, *mlevel3;
    4390           0 :     int count2 = 0, count3 = 0;
    4391             : 
    4392           0 :     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { /* only a unicode string of exactly 256 characters is accepted */
    4393           0 :         PyErr_BadArgument();
    4394           0 :         return NULL;
    4395             :     }
    4396           0 :     decode = PyUnicode_AS_UNICODE(string);
    4397           0 :     memset(level1, 0xFF, sizeof level1);
    4398           0 :     memset(level2, 0xFF, sizeof level2);
    4399             : 
    4400             :     /* If there isn't a one-to-one mapping of NULL to \0,
    4401             :        or if there are non-BMP characters, we need to use
    4402             :        a mapping dictionary. */
    4403           0 :     if (decode[0] != 0)
    4404           0 :         need_dict = 1;
    4405           0 :     for (i = 1; i < 256; i++) { /* first pass: count the level-2 and level-3 blocks the trie would need */
    4406             :         int l1, l2;
    4407           0 :         if (decode[i] == 0
    4408             : #ifdef Py_UNICODE_WIDE
    4409             :             || decode[i] > 0xFFFF
    4410             : #endif
    4411             :             ) {
    4412           0 :             need_dict = 1;
    4413           0 :             break;
    4414             :         }
    4415           0 :         if (decode[i] == 0xFFFE)
    4416             :             /* unmapped character */
    4417           0 :             continue;
    4418           0 :         l1 = decode[i] >> 11;
    4419           0 :         l2 = decode[i] >> 7;
    4420           0 :         if (level1[l1] == 0xFF)
    4421           0 :             level1[l1] = count2++;
    4422           0 :         if (level2[l2] == 0xFF)
    4423           0 :             level2[l2] = count3++;
    4424             :     }
    4425             : 
    4426           0 :     if (count2 >= 0xFF || count3 >= 0xFF) /* block numbers are stored in unsigned chars and 0xFF is the "absent" marker */
    4427           0 :         need_dict = 1;
    4428             : 
    4429           0 :     if (need_dict) {
    4430           0 :         PyObject *result = PyDict_New(); /* shadows the outer result */
    4431             :         PyObject *key, *value;
    4432           0 :         if (!result)
    4433           0 :             return NULL;
    4434           0 :         for (i = 0; i < 256; i++) {
    4435           0 :             value = NULL;
    4436           0 :             key = PyInt_FromLong(decode[i]); /* dict maps decoded character -> byte value */
    4437           0 :             value = PyInt_FromLong(i);
    4438           0 :             if (!key || !value)
    4439             :                 goto failed1;
    4440           0 :             if (PyDict_SetItem(result, key, value) == -1)
    4441           0 :                 goto failed1;
    4442           0 :             Py_DECREF(key);
    4443           0 :             Py_DECREF(value);
    4444             :         }
    4445           0 :         return result;
    4446             :       failed1:
    4447           0 :         Py_XDECREF(key); /* either of key/value may be NULL here */
    4448           0 :         Py_XDECREF(value);
    4449           0 :         Py_DECREF(result);
    4450           0 :         return NULL;
    4451             :     }
    4452             : 
    4453             :     /* Create a three-level trie */
    4454           0 :     result = PyObject_MALLOC(sizeof(struct encoding_map) +
    4455           0 :                              16*count2 + 128*count3 - 1); /* -1: level23[1] is already counted in sizeof */
    4456           0 :     if (!result)
    4457           0 :         return PyErr_NoMemory();
    4458           0 :     PyObject_Init(result, &EncodingMapType);
    4459           0 :     mresult = (struct encoding_map*)result;
    4460           0 :     mresult->count2 = count2;
    4461           0 :     mresult->count3 = count3;
    4462           0 :     mlevel1 = mresult->level1;
    4463           0 :     mlevel2 = mresult->level23;
    4464           0 :     mlevel3 = mresult->level23 + 16*count2; /* level-3 blocks start right after the level-2 blocks */
    4465           0 :     memcpy(mlevel1, level1, 32);
    4466           0 :     memset(mlevel2, 0xFF, 16*count2);
    4467           0 :     memset(mlevel3, 0, 128*count3); /* 0 in level 3 means "no byte maps here" (see encoding_map_lookup) */
    4468           0 :     count3 = 0; /* reused as the next free level-3 block number */
    4469           0 :     for (i = 1; i < 256; i++) { /* byte 0 is skipped: decode[0] is known to be 0 on this path */
    4470             :         int o1, o2, o3, i2, i3;
    4471           0 :         if (decode[i] == 0xFFFE)
    4472             :             /* unmapped character */
    4473           0 :             continue;
    4474           0 :         o1 = decode[i]>>11;
    4475           0 :         o2 = (decode[i]>>7) & 0xF;
    4476           0 :         i2 = 16*mlevel1[o1] + o2;
    4477           0 :         if (mlevel2[i2] == 0xFF)
    4478           0 :             mlevel2[i2] = count3++;
    4479           0 :         o3 = decode[i] & 0x7F;
    4480           0 :         i3 = 128*mlevel2[i2] + o3;
    4481           0 :         mlevel3[i3] = i; /* leaf stores the byte that encodes this character */
    4482             :     }
    4483           0 :     return result;
    4484             : }
    4485             : /* Look up character c in an EncodingMap trie; returns the byte value, or -1 if c has no mapping. */
    4486             : static int
    4487           0 : encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
    4488             : {
    4489           0 :     struct encoding_map *map = (struct encoding_map*)mapping;
    4490           0 :     int l1 = c>>11; /* index into level1 */
    4491           0 :     int l2 = (c>>7) & 0xF; /* slot within a 16-entry level-2 block */
    4492           0 :     int l3 = c & 0x7F; /* slot within a 128-entry level-3 block */
    4493             :     int i;
    4494             : 
    4495             : #ifdef Py_UNICODE_WIDE
    4496             :     if (c > 0xFFFF) {
    4497             :         return -1;
    4498             :     }
    4499             : #endif
    4500           0 :     if (c == 0) /* character 0 always encodes to byte 0 */
    4501           0 :         return 0;
    4502             :     /* level 1*/
    4503           0 :     i = map->level1[l1];
    4504           0 :     if (i == 0xFF) {
    4505           0 :         return -1;
    4506             :     }
    4507             :     /* level 2*/
    4508           0 :     i = map->level23[16*i+l2];
    4509           0 :     if (i == 0xFF) {
    4510           0 :         return -1;
    4511             :     }
    4512             :     /* level 3 */
    4513           0 :     i = map->level23[16*map->count2 + 128*i + l3];
    4514           0 :     if (i == 0) { /* 0 marks an empty level-3 slot */
    4515           0 :         return -1;
    4516             :     }
    4517           0 :     return i;
    4518             : }
    4519             : 
    4520             : /* Look up the character c in the mapping and return the mapped int
    4521             :    (in range(256)) or str object. If the character can't be found,
    4522             :    Py_None is returned (or NULL, if another error occurred). */
    4523           0 : static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
    4524             : {
    4525           0 :     PyObject *w = PyInt_FromLong((long)c);
    4526             :     PyObject *x;
    4527             : 
    4528           0 :     if (w == NULL)
    4529           0 :         return NULL;
    4530           0 :     x = PyObject_GetItem(mapping, w);
    4531           0 :     Py_DECREF(w);
    4532           0 :     if (x == NULL) {
    4533           0 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    4534             :             /* No mapping found means: mapping is undefined. */
    4535           0 :             PyErr_Clear();
    4536           0 :             x = Py_None;
    4537           0 :             Py_INCREF(x); /* callers Py_DECREF whatever non-NULL object is returned */
    4538           0 :             return x;
    4539             :         } else
    4540           0 :             return NULL;
    4541             :     }
    4542           0 :     else if (x == Py_None) /* the mapping itself answered None: also "undefined" */
    4543           0 :         return x;
    4544           0 :     else if (PyInt_Check(x)) {
    4545           0 :         long value = PyInt_AS_LONG(x);
    4546           0 :         if (value < 0 || value > 255) {
    4547           0 :             PyErr_SetString(PyExc_TypeError,
    4548             :                             "character mapping must be in range(256)");
    4549           0 :             Py_DECREF(x);
    4550           0 :             return NULL;
    4551             :         }
    4552           0 :         return x;
    4553             :     }
    4554           0 :     else if (PyString_Check(x))
    4555           0 :         return x;
    4556             :     else {
    4557             :         /* wrong return value */
    4558           0 :         PyErr_SetString(PyExc_TypeError,
    4559             :                         "character mapping must return integer, None or str");
    4560           0 :         Py_DECREF(x);
    4561           0 :         return NULL;
    4562             :     }
    4563             : }
    4564             : /* Resize *outobj to max(requiredsize, twice its current size); returns 1 on success, 0 on failure. outpos is not used. */
    4565             : static int
    4566           0 : charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
    4567             : {
    4568           0 :     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
    4569             :     /* exponentially overallocate to minimize reallocations */
    4570           0 :     if (requiredsize < 2*outsize)
    4571           0 :         requiredsize = 2*outsize;
    4572           0 :     if (_PyString_Resize(outobj, requiredsize)) { /* a nonzero result is treated as failure */
    4573           0 :         return 0;
    4574             :     }
    4575           0 :     return 1;
    4576             : }
    4577             : /* Outcome of charmapencode_output: output written, no mapping for the character, or an error occurred. */
    4578             : typedef enum charmapencode_result {
    4579             :     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
    4580             : }charmapencode_result;
    4581             : /* lookup the character, put the result in the output string and adjust
    4582             :    various state variables. Reallocate the output string if not enough
    4583             :    space is available. Return enc_SUCCESS if the replacement was written,
    4584             :    enc_FAILED if the mapping was undefined (in which case no character
    4585             :    was written) or enc_EXCEPTION if the lookup or a reallocation
    4586             :    failed. *outpos is advanced past whatever was written. */
    4587             : static
    4588           0 : charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
    4589             :                                           PyObject **outobj, Py_ssize_t *outpos)
    4590             : {
    4591             :     PyObject *rep;
    4592             :     char *outstart;
    4593           0 :     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
    4594             : 
    4595           0 :     if (Py_TYPE(mapping) == &EncodingMapType) { /* trie mapping: the lookup yields the output byte directly */
    4596           0 :         int res = encoding_map_lookup(c, mapping);
    4597           0 :         Py_ssize_t requiredsize = *outpos+1;
    4598           0 :         if (res == -1)
    4599           0 :             return enc_FAILED;
    4600           0 :         if (outsize<requiredsize)
    4601           0 :             if (!charmapencode_resize(outobj, outpos, requiredsize))
    4602           0 :                 return enc_EXCEPTION;
    4603           0 :         outstart = PyString_AS_STRING(*outobj); /* fetched after the possible resize of *outobj */
    4604           0 :         outstart[(*outpos)++] = (char)res;
    4605           0 :         return enc_SUCCESS;
    4606             :     }
    4607             : 
    4608           0 :     rep = charmapencode_lookup(c, mapping); /* other mappings: result is NULL, Py_None, an int or a str */
    4609           0 :     if (rep==NULL)
    4610           0 :         return enc_EXCEPTION;
    4611           0 :     else if (rep==Py_None) {
    4612           0 :         Py_DECREF(rep);
    4613           0 :         return enc_FAILED;
    4614             :     } else {
    4615           0 :         if (PyInt_Check(rep)) { /* int: emit a single byte */
    4616           0 :             Py_ssize_t requiredsize = *outpos+1;
    4617           0 :             if (outsize<requiredsize)
    4618           0 :                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
    4619           0 :                     Py_DECREF(rep);
    4620           0 :                     return enc_EXCEPTION;
    4621             :                 }
    4622           0 :             outstart = PyString_AS_STRING(*outobj);
    4623           0 :             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
    4624             :         }
    4625             :         else { /* str: copy all of its bytes */
    4626           0 :             const char *repchars = PyString_AS_STRING(rep);
    4627           0 :             Py_ssize_t repsize = PyString_GET_SIZE(rep);
    4628           0 :             Py_ssize_t requiredsize = *outpos+repsize;
    4629           0 :             if (outsize<requiredsize)
    4630           0 :                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
    4631           0 :                     Py_DECREF(rep);
    4632           0 :                     return enc_EXCEPTION;
    4633             :                 }
    4634           0 :             outstart = PyString_AS_STRING(*outobj);
    4635           0 :             memcpy(outstart + *outpos, repchars, repsize);
    4636           0 :             *outpos += repsize;
    4637             :         }
    4638             :     }
    4639           0 :     Py_DECREF(rep);
    4640           0 :     return enc_SUCCESS;
    4641             : }
    4642             : 
    4643             : /* handle an error in PyUnicode_EncodeCharmap
    4644             :    Return 0 on success, -1 on error */
    4645             : static
    4646           0 : int charmap_encoding_error(
    4647             :     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
    4648             :     PyObject **exceptionObject,
    4649             :     int *known_errorHandler, PyObject **errorHandler, const char *errors,
    4650             :     PyObject **res, Py_ssize_t *respos)
    4651             : {
    4652           0 :     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    4653             :     Py_ssize_t repsize;
    4654             :     Py_ssize_t newpos;
    4655             :     Py_UNICODE *uni2;
    4656             :     /* startpos for collecting unencodable chars */
    4657           0 :     Py_ssize_t collstartpos = *inpos;
    4658           0 :     Py_ssize_t collendpos = *inpos+1;
    4659             :     Py_ssize_t collpos;
    4660           0 :     char *encoding = "charmap";
    4661           0 :     char *reason = "character maps to <undefined>";
    4662             :     charmapencode_result x;
    4663             : 
    4664             :     /* find all unencodable characters */
    4665           0 :     while (collendpos < size) {
    4666             :         PyObject *rep;
    4667           0 :         if (Py_TYPE(mapping) == &EncodingMapType) {
    4668           0 :             int res = encoding_map_lookup(p[collendpos], mapping);
    4669           0 :             if (res != -1)
    4670           0 :                 break;
    4671           0 :             ++collendpos;
    4672           0 :             continue;
    4673             :         }
    4674             : 
    4675           0 :         rep = charmapencode_lookup(p[collendpos], mapping);
    4676           0 :         if (rep==NULL)
    4677           0 :             return -1;
    4678           0 :         else if (rep!=Py_None) {
    4679           0 :             Py_DECREF(rep);
    4680           0 :             break;
    4681             :         }
    4682           0 :         Py_DECREF(rep);
    4683           0 :         ++collendpos;
    4684             :     }
    4685             :     /* cache callback name lookup
    4686             :      * (if not done yet, i.e. it's the first error) */
    4687           0 :     if (*known_errorHandler==-1) {
    4688           0 :         if ((errors==NULL) || (!strcmp(errors, "strict")))
    4689           0 :             *known_errorHandler = 1;
    4690           0 :         else if (!strcmp(errors, "replace"))
    4691           0 :             *known_errorHandler = 2;
    4692           0 :         else if (!strcmp(errors, "ignore"))
    4693           0 :             *known_errorHandler = 3;
    4694           0 :         else if (!strcmp(errors, "xmlcharrefreplace"))
    4695           0 :             *known_errorHandler = 4;
    4696             :         else
    4697           0 :             *known_errorHandler = 0;
    4698             :     }
    4699           0 :     switch (*known_errorHandler) {
    4700             :     case 1: /* strict */
    4701           0 :         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
    4702           0 :         return -1;
    4703             :     case 2: /* replace */
    4704           0 :         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
    4705           0 :             x = charmapencode_output('?', mapping, res, respos);
    4706           0 :             if (x==enc_EXCEPTION) {
    4707           0 :                 return -1;
    4708             :             }
    4709           0 :             else if (x==enc_FAILED) {
    4710           0 :                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
    4711           0 :                 return -1;
    4712             :             }
    4713             :         }
    4714             :         /* fall through */
    4715             :     case 3: /* ignore */
    4716           0 :         *inpos = collendpos;
    4717           0 :         break;
    4718             :     case 4: /* xmlcharrefreplace */
    4719             :         /* generate replacement */
    4720           0 :         for (collpos = collstartpos; collpos < collendpos;) {
    4721             :             char buffer[2+29+1+1];
    4722             :             char *cp;
    4723           0 :             Py_UCS4 ch = p[collpos++];
    4724             : #ifndef Py_UNICODE_WIDE
    4725           0 :             if ((0xD800 <= ch && ch <= 0xDBFF) &&
    4726           0 :                 (collpos < collendpos) &&
    4727           0 :                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
    4728           0 :                 ch = ((((ch & 0x03FF) << 10) |
    4729           0 :                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
    4730             :             }
    4731             : #endif
    4732           0 :             sprintf(buffer, "&#%d;", (int)ch);
    4733           0 :             for (cp = buffer; *cp; ++cp) {
    4734           0 :                 x = charmapencode_output(*cp, mapping, res, respos);
    4735           0 :                 if (x==enc_EXCEPTION)
    4736           0 :                     return -1;
    4737           0 :                 else if (x==enc_FAILED) {
    4738           0 :                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
    4739           0 :                     return -1;
    4740             :                 }
    4741             :             }
    4742             :         }
    4743           0 :         *inpos = collendpos;
    4744           0 :         break;
    4745             :     default:
    4746           0 :         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
    4747             :                                                       encoding, reason, p, size, exceptionObject,
    4748             :                                                       collstartpos, collendpos, &newpos);
    4749           0 :         if (repunicode == NULL)
    4750           0 :             return -1;
    4751             :         /* generate replacement  */
    4752           0 :         repsize = PyUnicode_GET_SIZE(repunicode);
    4753           0 :         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
    4754           0 :             x = charmapencode_output(*uni2, mapping, res, respos);
    4755           0 :             if (x==enc_EXCEPTION) {
    4756           0 :                 return -1;
    4757             :             }
    4758           0 :             else if (x==enc_FAILED) {
    4759           0 :                 Py_DECREF(repunicode);
    4760           0 :                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
    4761           0 :                 return -1;
    4762             :             }
    4763             :         }
    4764           0 :         *inpos = newpos;
    4765           0 :         Py_DECREF(repunicode);
    4766             :     }
    4767           0 :     return 0;
    4768             : }
    4769             : 
    4770           0 : PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
    4771             :                                   Py_ssize_t size,
    4772             :                                   PyObject *mapping,
    4773             :                                   const char *errors)
    4774             : {
    4775             :     /* output object */
    4776           0 :     PyObject *res = NULL;
    4777             :     /* current input position */
    4778           0 :     Py_ssize_t inpos = 0;
    4779             :     /* current output position */
    4780           0 :     Py_ssize_t respos = 0;
    4781           0 :     PyObject *errorHandler = NULL;
    4782           0 :     PyObject *exc = NULL;
    4783             :     /* the following variable is used for caching string comparisons
    4784             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
    4785             :      * 3=ignore, 4=xmlcharrefreplace */
    4786           0 :     int known_errorHandler = -1;
    4787             : 
    4788             :     /* Default to Latin-1 */
    4789           0 :     if (mapping == NULL)
    4790           0 :         return PyUnicode_EncodeLatin1(p, size, errors);
    4791             : 
    4792             :     /* allocate enough for a simple encoding without
    4793             :        replacements, if we need more, we'll resize */
    4794           0 :     res = PyString_FromStringAndSize(NULL, size);
    4795           0 :     if (res == NULL)
    4796           0 :         goto onError;
    4797           0 :     if (size == 0)
    4798           0 :         return res;
    4799             : 
    4800           0 :     while (inpos<size) {
    4801             :         /* try to encode it */
    4802           0 :         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
    4803           0 :         if (x==enc_EXCEPTION) /* error */
    4804           0 :             goto onError;
    4805           0 :         if (x==enc_FAILED) { /* unencodable character */
    4806           0 :             if (charmap_encoding_error(p, size, &inpos, mapping,
    4807             :                                        &exc,
    4808             :                                        &known_errorHandler, &errorHandler, errors,
    4809             :                                        &res, &respos)) {
    4810           0 :                 goto onError;
    4811             :             }
    4812             :         }
    4813             :         else
    4814             :             /* done with this character => adjust input position */
    4815           0 :             ++inpos;
    4816             :     }
    4817             : 
    4818             :     /* Resize if we allocated to much */
    4819           0 :     if (respos<PyString_GET_SIZE(res)) {
    4820           0 :         if (_PyString_Resize(&res, respos))
    4821           0 :             goto onError;
    4822             :     }
    4823           0 :     Py_XDECREF(exc);
    4824           0 :     Py_XDECREF(errorHandler);
    4825           0 :     return res;
    4826             : 
    4827             :   onError:
    4828           0 :     Py_XDECREF(res);
    4829           0 :     Py_XDECREF(exc);
    4830           0 :     Py_XDECREF(errorHandler);
    4831           0 :     return NULL;
    4832             : }
    4833             : 
    4834           0 : PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
    4835             :                                     PyObject *mapping)
    4836             : {
    4837           0 :     if (!PyUnicode_Check(unicode) || mapping == NULL) {
    4838           0 :         PyErr_BadArgument();
    4839           0 :         return NULL;
    4840             :     }
    4841           0 :     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
    4842             :                                    PyUnicode_GET_SIZE(unicode),
    4843             :                                    mapping,
    4844             :                                    NULL);
    4845             : }
    4846             : 
    4847             : /* create or adjust a UnicodeTranslateError */
    4848           0 : static void make_translate_exception(PyObject **exceptionObject,
    4849             :                                      const Py_UNICODE *unicode, Py_ssize_t size,
    4850             :                                      Py_ssize_t startpos, Py_ssize_t endpos,
    4851             :                                      const char *reason)
    4852             : {
    4853           0 :     if (*exceptionObject == NULL) {
    4854           0 :         *exceptionObject = PyUnicodeTranslateError_Create(
    4855             :             unicode, size, startpos, endpos, reason);
    4856             :     }
    4857             :     else {
    4858           0 :         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
    4859           0 :             goto onError;
    4860           0 :         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
    4861           0 :             goto onError;
    4862           0 :         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
    4863           0 :             goto onError;
    4864           0 :         return;
    4865             :       onError:
    4866           0 :         Py_CLEAR(*exceptionObject);
    4867             :     }
    4868             : }
    4869             : 
    4870             : /* raises a UnicodeTranslateError */
    4871           0 : static void raise_translate_exception(PyObject **exceptionObject,
    4872             :                                       const Py_UNICODE *unicode, Py_ssize_t size,
    4873             :                                       Py_ssize_t startpos, Py_ssize_t endpos,
    4874             :                                       const char *reason)
    4875             : {
    4876           0 :     make_translate_exception(exceptionObject,
    4877             :                              unicode, size, startpos, endpos, reason);
    4878           0 :     if (*exceptionObject != NULL)
    4879           0 :         PyCodec_StrictErrors(*exceptionObject);
    4880           0 : }
    4881             : 
    4882             : /* error handling callback helper:
    4883             :    build arguments, call the callback and check the arguments,
    4884             :    put the result into newpos and return the replacement string, which
    4885             :    has to be freed by the caller */
    4886           0 : static PyObject *unicode_translate_call_errorhandler(const char *errors,
    4887             :                                                      PyObject **errorHandler,
    4888             :                                                      const char *reason,
    4889             :                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
    4890             :                                                      Py_ssize_t startpos, Py_ssize_t endpos,
    4891             :                                                      Py_ssize_t *newpos)
    4892             : {
    4893             :     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
    4894             : 
    4895             :     Py_ssize_t i_newpos;
    4896             :     PyObject *restuple;
    4897             :     PyObject *resunicode;
    4898             : 
    4899           0 :     if (*errorHandler == NULL) {
    4900           0 :         *errorHandler = PyCodec_LookupError(errors);
    4901           0 :         if (*errorHandler == NULL)
    4902           0 :             return NULL;
    4903             :     }
    4904             : 
    4905           0 :     make_translate_exception(exceptionObject,
    4906             :                              unicode, size, startpos, endpos, reason);
    4907           0 :     if (*exceptionObject == NULL)
    4908           0 :         return NULL;
    4909             : 
    4910           0 :     restuple = PyObject_CallFunctionObjArgs(
    4911             :         *errorHandler, *exceptionObject, NULL);
    4912           0 :     if (restuple == NULL)
    4913           0 :         return NULL;
    4914           0 :     if (!PyTuple_Check(restuple)) {
    4915           0 :         PyErr_SetString(PyExc_TypeError, &argparse[4]);
    4916           0 :         Py_DECREF(restuple);
    4917           0 :         return NULL;
    4918             :     }
    4919           0 :     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
    4920             :                           &resunicode, &i_newpos)) {
    4921           0 :         Py_DECREF(restuple);
    4922           0 :         return NULL;
    4923             :     }
    4924           0 :     if (i_newpos<0)
    4925           0 :         *newpos = size+i_newpos;
    4926             :     else
    4927           0 :         *newpos = i_newpos;
    4928           0 :     if (*newpos<0 || *newpos>size) {
    4929           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    4930           0 :         Py_DECREF(restuple);
    4931           0 :         return NULL;
    4932             :     }
    4933           0 :     Py_INCREF(resunicode);
    4934           0 :     Py_DECREF(restuple);
    4935           0 :     return resunicode;
    4936             : }
    4937             : 
    4938             : /* Lookup the character ch in the mapping and put the result in result,
    4939             :    which must be decrefed by the caller.
    4940             :    Return 0 on success, -1 on error */
    4941             : static
    4942           0 : int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
    4943             : {
    4944           0 :     PyObject *w = PyInt_FromLong((long)c);
    4945             :     PyObject *x;
    4946             : 
    4947           0 :     if (w == NULL)
    4948           0 :         return -1;
    4949           0 :     x = PyObject_GetItem(mapping, w);
    4950           0 :     Py_DECREF(w);
    4951           0 :     if (x == NULL) {
    4952           0 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    4953             :             /* No mapping found means: use 1:1 mapping. */
    4954           0 :             PyErr_Clear();
    4955           0 :             *result = NULL;
    4956           0 :             return 0;
    4957             :         } else
    4958           0 :             return -1;
    4959             :     }
    4960           0 :     else if (x == Py_None) {
    4961           0 :         *result = x;
    4962           0 :         return 0;
    4963             :     }
    4964           0 :     else if (PyInt_Check(x)) {
    4965           0 :         long value = PyInt_AS_LONG(x);
    4966           0 :         long max = PyUnicode_GetMax();
    4967           0 :         if (value < 0 || value > max) {
    4968           0 :             PyErr_Format(PyExc_TypeError,
    4969             :                          "character mapping must be in range(0x%lx)", max+1);
    4970           0 :             Py_DECREF(x);
    4971           0 :             return -1;
    4972             :         }
    4973           0 :         *result = x;
    4974           0 :         return 0;
    4975             :     }
    4976           0 :     else if (PyUnicode_Check(x)) {
    4977           0 :         *result = x;
    4978           0 :         return 0;
    4979             :     }
    4980             :     else {
    4981             :         /* wrong return value */
    4982           0 :         PyErr_SetString(PyExc_TypeError,
    4983             :                         "character mapping must return integer, None or unicode");
    4984           0 :         Py_DECREF(x);
    4985           0 :         return -1;
    4986             :     }
    4987             : }
    4988             : /* ensure that *outobj is at least requiredsize characters long,
    4989             :    if not reallocate and adjust various state variables.
    4990             :    Return 0 on success, -1 on error */
    4991             : static
    4992           0 : int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
    4993             :                                Py_ssize_t requiredsize)
    4994             : {
    4995           0 :     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
    4996           0 :     if (requiredsize > oldsize) {
    4997             :         /* remember old output position */
    4998           0 :         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
    4999             :         /* exponentially overallocate to minimize reallocations */
    5000           0 :         if (requiredsize < 2 * oldsize)
    5001           0 :             requiredsize = 2 * oldsize;
    5002           0 :         if (PyUnicode_Resize(outobj, requiredsize) < 0)
    5003           0 :             return -1;
    5004           0 :         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
    5005             :     }
    5006           0 :     return 0;
    5007             : }
    5008             : /* lookup the character, put the result in the output string and adjust
    5009             :    various state variables. Return a new reference to the object that
    5010             :    was put in the output buffer in *result, or Py_None, if the mapping was
    5011             :    undefined (in which case no character was written).
    5012             :    The called must decref result.
    5013             :    Return 0 on success, -1 on error. */
    5014             : static
    5015           0 : int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
    5016             :                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
    5017             :                             PyObject **res)
    5018             : {
    5019           0 :     if (charmaptranslate_lookup(*curinp, mapping, res))
    5020           0 :         return -1;
    5021           0 :     if (*res==NULL) {
    5022             :         /* not found => default to 1:1 mapping */
    5023           0 :         *(*outp)++ = *curinp;
    5024             :     }
    5025           0 :     else if (*res==Py_None)
    5026             :         ;
    5027           0 :     else if (PyInt_Check(*res)) {
    5028             :         /* no overflow check, because we know that the space is enough */
    5029           0 :         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
    5030             :     }
    5031           0 :     else if (PyUnicode_Check(*res)) {
    5032           0 :         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
    5033           0 :         if (repsize==1) {
    5034             :             /* no overflow check, because we know that the space is enough */
    5035           0 :             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
    5036             :         }
    5037           0 :         else if (repsize!=0) {
    5038             :             /* more than one character */
    5039           0 :             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
    5040           0 :                 (insize - (curinp-startinp)) +
    5041             :                 repsize - 1;
    5042           0 :             if (charmaptranslate_makespace(outobj, outp, requiredsize))
    5043           0 :                 return -1;
    5044           0 :             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
    5045           0 :             *outp += repsize;
    5046             :         }
    5047             :     }
    5048             :     else
    5049           0 :         return -1;
    5050           0 :     return 0;
    5051             : }
    5052             : 
    5053           0 : PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
    5054             :                                      Py_ssize_t size,
    5055             :                                      PyObject *mapping,
    5056             :                                      const char *errors)
    5057             : {
    5058             :     /* output object */
    5059           0 :     PyObject *res = NULL;
    5060             :     /* pointers to the beginning and end+1 of input */
    5061           0 :     const Py_UNICODE *startp = p;
    5062           0 :     const Py_UNICODE *endp = p + size;
    5063             :     /* pointer into the output */
    5064             :     Py_UNICODE *str;
    5065             :     /* current output position */
    5066           0 :     Py_ssize_t respos = 0;
    5067           0 :     char *reason = "character maps to <undefined>";
    5068           0 :     PyObject *errorHandler = NULL;
    5069           0 :     PyObject *exc = NULL;
    5070             :     /* the following variable is used for caching string comparisons
    5071             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
    5072             :      * 3=ignore, 4=xmlcharrefreplace */
    5073           0 :     int known_errorHandler = -1;
    5074             : 
    5075           0 :     if (mapping == NULL) {
    5076           0 :         PyErr_BadArgument();
    5077           0 :         return NULL;
    5078             :     }
    5079             : 
    5080             :     /* allocate enough for a simple 1:1 translation without
    5081             :        replacements, if we need more, we'll resize */
    5082           0 :     res = PyUnicode_FromUnicode(NULL, size);
    5083           0 :     if (res == NULL)
    5084           0 :         goto onError;
    5085           0 :     if (size == 0)
    5086           0 :         return res;
    5087           0 :     str = PyUnicode_AS_UNICODE(res);
    5088             : 
    5089           0 :     while (p<endp) {
    5090             :         /* try to encode it */
    5091           0 :         PyObject *x = NULL;
    5092           0 :         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
    5093           0 :             Py_XDECREF(x);
    5094           0 :             goto onError;
    5095             :         }
    5096           0 :         Py_XDECREF(x);
    5097           0 :         if (x!=Py_None) /* it worked => adjust input pointer */
    5098           0 :             ++p;
    5099             :         else { /* untranslatable character */
    5100           0 :             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    5101             :             Py_ssize_t repsize;
    5102             :             Py_ssize_t newpos;
    5103             :             Py_UNICODE *uni2;
    5104             :             /* startpos for collecting untranslatable chars */
    5105           0 :             const Py_UNICODE *collstart = p;
    5106           0 :             const Py_UNICODE *collend = p+1;
    5107             :             const Py_UNICODE *coll;
    5108             : 
    5109             :             /* find all untranslatable characters */
    5110           0 :             while (collend < endp) {
    5111           0 :                 if (charmaptranslate_lookup(*collend, mapping, &x))
    5112           0 :                     goto onError;
    5113           0 :                 Py_XDECREF(x);
    5114           0 :                 if (x!=Py_None)
    5115           0 :                     break;
    5116           0 :                 ++collend;
    5117             :             }
    5118             :             /* cache callback name lookup
    5119             :              * (if not done yet, i.e. it's the first error) */
    5120           0 :             if (known_errorHandler==-1) {
    5121           0 :                 if ((errors==NULL) || (!strcmp(errors, "strict")))
    5122           0 :                     known_errorHandler = 1;
    5123           0 :                 else if (!strcmp(errors, "replace"))
    5124           0 :                     known_errorHandler = 2;
    5125           0 :                 else if (!strcmp(errors, "ignore"))
    5126           0 :                     known_errorHandler = 3;
    5127           0 :                 else if (!strcmp(errors, "xmlcharrefreplace"))
    5128           0 :                     known_errorHandler = 4;
    5129             :                 else
    5130           0 :                     known_errorHandler = 0;
    5131             :             }
    5132           0 :             switch (known_errorHandler) {
    5133             :             case 1: /* strict */
    5134           0 :                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
    5135           0 :                 goto onError;
    5136             :             case 2: /* replace */
    5137             :                 /* No need to check for space, this is a 1:1 replacement */
    5138           0 :                 for (coll = collstart; coll<collend; ++coll)
    5139           0 :                     *str++ = '?';
    5140             :                 /* fall through */
    5141             :             case 3: /* ignore */
    5142           0 :                 p = collend;
    5143           0 :                 break;
    5144             :             case 4: /* xmlcharrefreplace */
    5145             :                 /* generate replacement (temporarily (mis)uses p) */
    5146           0 :                 for (p = collstart; p < collend;) {
    5147             :                     char buffer[2+29+1+1];
    5148             :                     char *cp;
    5149           0 :                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
    5150           0 :                     sprintf(buffer, "&#%d;", (int)ch);
    5151           0 :                     if (charmaptranslate_makespace(&res, &str,
    5152           0 :                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
    5153           0 :                         goto onError;
    5154           0 :                     for (cp = buffer; *cp; ++cp)
    5155           0 :                         *str++ = *cp;
    5156             :                 }
    5157           0 :                 p = collend;
    5158           0 :                 break;
    5159             :             default:
    5160           0 :                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
    5161             :                                                                  reason, startp, size, &exc,
    5162           0 :                                                                  collstart-startp, collend-startp, &newpos);
    5163           0 :                 if (repunicode == NULL)
    5164           0 :                     goto onError;
    5165             :                 /* generate replacement  */
    5166           0 :                 repsize = PyUnicode_GET_SIZE(repunicode);
    5167           0 :                 if (charmaptranslate_makespace(&res, &str,
    5168           0 :                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
    5169           0 :                     Py_DECREF(repunicode);
    5170           0 :                     goto onError;
    5171             :                 }
    5172           0 :                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
    5173           0 :                     *str++ = *uni2;
    5174           0 :                 p = startp + newpos;
    5175           0 :                 Py_DECREF(repunicode);
    5176             :             }
    5177             :         }
    5178             :     }
    5179             :     /* Resize if we allocated to much */
    5180           0 :     respos = str-PyUnicode_AS_UNICODE(res);
    5181           0 :     if (respos<PyUnicode_GET_SIZE(res)) {
    5182           0 :         if (PyUnicode_Resize(&res, respos) < 0)
    5183           0 :             goto onError;
    5184             :     }
    5185           0 :     Py_XDECREF(exc);
    5186           0 :     Py_XDECREF(errorHandler);
    5187           0 :     return res;
    5188             : 
    5189             :   onError:
    5190           0 :     Py_XDECREF(res);
    5191           0 :     Py_XDECREF(exc);
    5192           0 :     Py_XDECREF(errorHandler);
    5193           0 :     return NULL;
    5194             : }
    5195             : 
    5196           0 : PyObject *PyUnicode_Translate(PyObject *str,
    5197             :                               PyObject *mapping,
    5198             :                               const char *errors)
    5199             : {
    5200             :     PyObject *result;
    5201             : 
    5202           0 :     str = PyUnicode_FromObject(str);
    5203           0 :     if (str == NULL)
    5204           0 :         goto onError;
    5205           0 :     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
    5206             :                                         PyUnicode_GET_SIZE(str),
    5207             :                                         mapping,
    5208             :                                         errors);
    5209           0 :     Py_DECREF(str);
    5210           0 :     return result;
    5211             : 
    5212             :   onError:
    5213           0 :     Py_XDECREF(str);
    5214           0 :     return NULL;
    5215             : }
    5216             : 
    5217             : /* --- Decimal Encoder ---------------------------------------------------- */
    5218             : 
    5219           0 : int PyUnicode_EncodeDecimal(Py_UNICODE *s,
    5220             :                             Py_ssize_t length,
    5221             :                             char *output,
    5222             :                             const char *errors)
    5223             : {
    5224             :     Py_UNICODE *p, *end;
    5225           0 :     PyObject *errorHandler = NULL;
    5226           0 :     PyObject *exc = NULL;
    5227           0 :     const char *encoding = "decimal";
    5228           0 :     const char *reason = "invalid decimal Unicode string";
    5229             :     /* the following variable is used for caching string comparisons
    5230             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    5231           0 :     int known_errorHandler = -1;
    5232             : 
    5233           0 :     if (output == NULL) {
    5234           0 :         PyErr_BadArgument();
    5235           0 :         return -1;
    5236             :     }
    5237             : 
    5238           0 :     p = s;
    5239           0 :     end = s + length;
    5240           0 :     while (p < end) {
    5241           0 :         register Py_UNICODE ch = *p;
    5242             :         int decimal;
    5243             :         PyObject *repunicode;
    5244             :         Py_ssize_t repsize;
    5245             :         Py_ssize_t newpos;
    5246             :         Py_UNICODE *uni2;
    5247             :         Py_UNICODE *collstart;
    5248             :         Py_UNICODE *collend;
    5249             : 
    5250           0 :         if (Py_UNICODE_ISSPACE(ch)) {
    5251           0 :             *output++ = ' ';
    5252           0 :             ++p;
    5253           0 :             continue;
    5254             :         }
    5255           0 :         decimal = Py_UNICODE_TODECIMAL(ch);
    5256           0 :         if (decimal >= 0) {
    5257           0 :             *output++ = '0' + decimal;
    5258           0 :             ++p;
    5259           0 :             continue;
    5260             :         }
    5261           0 :         if (0 < ch && ch < 256) {
    5262           0 :             *output++ = (char)ch;
    5263           0 :             ++p;
    5264           0 :             continue;
    5265             :         }
    5266             :         /* All other characters are considered unencodable */
    5267           0 :         collstart = p;
    5268           0 :         for (collend = p+1; collend < end; collend++) {
    5269           0 :             if ((0 < *collend && *collend < 256) ||
    5270           0 :                 Py_UNICODE_ISSPACE(*collend) ||
    5271           0 :                 0 <= Py_UNICODE_TODECIMAL(*collend))
    5272             :                 break;
    5273             :         }
    5274             :         /* cache callback name lookup
    5275             :          * (if not done yet, i.e. it's the first error) */
    5276           0 :         if (known_errorHandler==-1) {
    5277           0 :             if ((errors==NULL) || (!strcmp(errors, "strict")))
    5278           0 :                 known_errorHandler = 1;
    5279           0 :             else if (!strcmp(errors, "replace"))
    5280           0 :                 known_errorHandler = 2;
    5281           0 :             else if (!strcmp(errors, "ignore"))
    5282           0 :                 known_errorHandler = 3;
    5283           0 :             else if (!strcmp(errors, "xmlcharrefreplace"))
    5284           0 :                 known_errorHandler = 4;
    5285             :             else
    5286           0 :                 known_errorHandler = 0;
    5287             :         }
    5288           0 :         switch (known_errorHandler) {
    5289             :         case 1: /* strict */
    5290           0 :             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
    5291           0 :             goto onError;
    5292             :         case 2: /* replace */
    5293           0 :             for (p = collstart; p < collend; ++p)
    5294           0 :                 *output++ = '?';
    5295             :             /* fall through */
    5296             :         case 3: /* ignore */
    5297           0 :             p = collend;
    5298           0 :             break;
    5299             :         case 4: /* xmlcharrefreplace */
    5300             :             /* generate replacement (temporarily (mis)uses p) */
    5301           0 :             for (p = collstart; p < collend;) {
    5302           0 :                 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
    5303           0 :                 output += sprintf(output, "&#%d;", ch);
    5304             :             }
    5305           0 :             p = collend;
    5306           0 :             break;
    5307             :         default:
    5308           0 :             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
    5309             :                                                           encoding, reason, s, length, &exc,
    5310           0 :                                                           collstart-s, collend-s, &newpos);
    5311           0 :             if (repunicode == NULL)
    5312           0 :                 goto onError;
    5313             :             /* generate replacement  */
    5314           0 :             repsize = PyUnicode_GET_SIZE(repunicode);
    5315           0 :             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
    5316           0 :                 Py_UNICODE ch = *uni2;
    5317           0 :                 if (Py_UNICODE_ISSPACE(ch))
    5318           0 :                     *output++ = ' ';
    5319             :                 else {
    5320           0 :                     decimal = Py_UNICODE_TODECIMAL(ch);
    5321           0 :                     if (decimal >= 0)
    5322           0 :                         *output++ = '0' + decimal;
    5323           0 :                     else if (0 < ch && ch < 256)
    5324           0 :                         *output++ = (char)ch;
    5325             :                     else {
    5326           0 :                         Py_DECREF(repunicode);
    5327           0 :                         raise_encode_exception(&exc, encoding,
    5328           0 :                                                s, length, collstart-s, collend-s, reason);
    5329           0 :                         goto onError;
    5330             :                     }
    5331             :                 }
    5332             :             }
    5333           0 :             p = s + newpos;
    5334           0 :             Py_DECREF(repunicode);
    5335             :         }
    5336             :     }
    5337             :     /* 0-terminate the output string */
    5338           0 :     *output++ = '\0';
    5339           0 :     Py_XDECREF(exc);
    5340           0 :     Py_XDECREF(errorHandler);
    5341           0 :     return 0;
    5342             : 
    5343             :   onError:
    5344           0 :     Py_XDECREF(exc);
    5345           0 :     Py_XDECREF(errorHandler);
    5346           0 :     return -1;
    5347             : }
    5348             : 
    5349             : /* --- Helpers ------------------------------------------------------------ */
    5350             : 
    5351             : #include "stringlib/unicodedefs.h"
    5352             : #include "stringlib/fastsearch.h"
    5353             : 
    5354             : #include "stringlib/count.h"
    5355             : #include "stringlib/find.h"
    5356             : #include "stringlib/partition.h"
    5357             : #include "stringlib/split.h"
    5358             : 
    5359             : /* helper macro to fixup start/end slice values */
    5360             : #define ADJUST_INDICES(start, end, len)         \
    5361             :     if (end > len)                              \
    5362             :         end = len;                              \
    5363             :     else if (end < 0) {                         \
    5364             :         end += len;                             \
    5365             :         if (end < 0)                            \
    5366             :             end = 0;                            \
    5367             :     }                                           \
    5368             :     if (start < 0) {                            \
    5369             :         start += len;                           \
    5370             :         if (start < 0)                          \
    5371             :             start = 0;                          \
    5372             :     }
    5373             : 
    5374           0 : Py_ssize_t PyUnicode_Count(PyObject *str,
    5375             :                            PyObject *substr,
    5376             :                            Py_ssize_t start,
    5377             :                            Py_ssize_t end)
    5378             : {
    5379             :     Py_ssize_t result;
    5380             :     PyUnicodeObject* str_obj;
    5381             :     PyUnicodeObject* sub_obj;
    5382             : 
    5383           0 :     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
    5384           0 :     if (!str_obj)
    5385           0 :         return -1;
    5386           0 :     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
    5387           0 :     if (!sub_obj) {
    5388           0 :         Py_DECREF(str_obj);
    5389           0 :         return -1;
    5390             :     }
    5391             : 
    5392           0 :     ADJUST_INDICES(start, end, str_obj->length);
    5393           0 :     result = stringlib_count(
    5394           0 :         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
    5395             :         PY_SSIZE_T_MAX
    5396             :         );
    5397             : 
    5398           0 :     Py_DECREF(sub_obj);
    5399           0 :     Py_DECREF(str_obj);
    5400             : 
    5401           0 :     return result;
    5402             : }
    5403             : 
    5404           0 : Py_ssize_t PyUnicode_Find(PyObject *str,
    5405             :                           PyObject *sub,
    5406             :                           Py_ssize_t start,
    5407             :                           Py_ssize_t end,
    5408             :                           int direction)
    5409             : {
    5410             :     Py_ssize_t result;
    5411             : 
    5412           0 :     str = PyUnicode_FromObject(str);
    5413           0 :     if (!str)
    5414           0 :         return -2;
    5415           0 :     sub = PyUnicode_FromObject(sub);
    5416           0 :     if (!sub) {
    5417           0 :         Py_DECREF(str);
    5418           0 :         return -2;
    5419             :     }
    5420             : 
    5421           0 :     if (direction > 0)
    5422           0 :         result = stringlib_find_slice(
    5423           0 :             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
    5424           0 :             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
    5425             :             start, end
    5426             :             );
    5427             :     else
    5428           0 :         result = stringlib_rfind_slice(
    5429           0 :             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
    5430           0 :             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
    5431             :             start, end
    5432             :             );
    5433             : 
    5434           0 :     Py_DECREF(str);
    5435           0 :     Py_DECREF(sub);
    5436             : 
    5437           0 :     return result;
    5438             : }
    5439             : 
    5440             : static
    5441           0 : int tailmatch(PyUnicodeObject *self,
    5442             :               PyUnicodeObject *substring,
    5443             :               Py_ssize_t start,
    5444             :               Py_ssize_t end,
    5445             :               int direction)
    5446             : {
    5447           0 :     if (substring->length == 0)
    5448           0 :         return 1;
    5449             : 
    5450           0 :     ADJUST_INDICES(start, end, self->length);
    5451           0 :     end -= substring->length;
    5452           0 :     if (end < start)
    5453           0 :         return 0;
    5454             : 
    5455           0 :     if (direction > 0) {
    5456           0 :         if (Py_UNICODE_MATCH(self, end, substring))
    5457           0 :             return 1;
    5458             :     } else {
    5459           0 :         if (Py_UNICODE_MATCH(self, start, substring))
    5460           0 :             return 1;
    5461             :     }
    5462             : 
    5463           0 :     return 0;
    5464             : }
    5465             : 
    5466           0 : Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
    5467             :                                PyObject *substr,
    5468             :                                Py_ssize_t start,
    5469             :                                Py_ssize_t end,
    5470             :                                int direction)
    5471             : {
    5472             :     Py_ssize_t result;
    5473             : 
    5474           0 :     str = PyUnicode_FromObject(str);
    5475           0 :     if (str == NULL)
    5476           0 :         return -1;
    5477           0 :     substr = PyUnicode_FromObject(substr);
    5478           0 :     if (substr == NULL) {
    5479           0 :         Py_DECREF(str);
    5480           0 :         return -1;
    5481             :     }
    5482             : 
    5483           0 :     result = tailmatch((PyUnicodeObject *)str,
    5484             :                        (PyUnicodeObject *)substr,
    5485             :                        start, end, direction);
    5486           0 :     Py_DECREF(str);
    5487           0 :     Py_DECREF(substr);
    5488           0 :     return result;
    5489             : }
    5490             : 
    5491             : /* Apply fixfct filter to the Unicode object self and return a
    5492             :    reference to the modified object */
    5493             : 
    5494             : static
    5495           0 : PyObject *fixup(PyUnicodeObject *self,
    5496             :                 int (*fixfct)(PyUnicodeObject *s))
    5497             : {
    5498             : 
    5499             :     PyUnicodeObject *u;
    5500             : 
    5501           0 :     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
    5502           0 :     if (u == NULL)
    5503           0 :         return NULL;
    5504             : 
    5505           0 :     Py_UNICODE_COPY(u->str, self->str, self->length);
    5506             : 
    5507           0 :     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
    5508             :         /* fixfct should return TRUE if it modified the buffer. If
    5509             :            FALSE, return a reference to the original buffer instead
    5510             :            (to save space, not time) */
    5511           0 :         Py_INCREF(self);
    5512           0 :         Py_DECREF(u);
    5513           0 :         return (PyObject*) self;
    5514             :     }
    5515           0 :     return (PyObject*) u;
    5516             : }
    5517             : 
    5518             : static
    5519           0 : int fixupper(PyUnicodeObject *self)
    5520             : {
    5521           0 :     Py_ssize_t len = self->length;
    5522           0 :     Py_UNICODE *s = self->str;
    5523           0 :     int status = 0;
    5524             : 
    5525           0 :     while (len-- > 0) {
    5526             :         register Py_UNICODE ch;
    5527             : 
    5528           0 :         ch = Py_UNICODE_TOUPPER(*s);
    5529           0 :         if (ch != *s) {
    5530           0 :             status = 1;
    5531           0 :             *s = ch;
    5532             :         }
    5533           0 :         s++;
    5534             :     }
    5535             : 
    5536           0 :     return status;
    5537             : }
    5538             : 
    5539             : static
    5540           0 : int fixlower(PyUnicodeObject *self)
    5541             : {
    5542           0 :     Py_ssize_t len = self->length;
    5543           0 :     Py_UNICODE *s = self->str;
    5544           0 :     int status = 0;
    5545             : 
    5546           0 :     while (len-- > 0) {
    5547             :         register Py_UNICODE ch;
    5548             : 
    5549           0 :         ch = Py_UNICODE_TOLOWER(*s);
    5550           0 :         if (ch != *s) {
    5551           0 :             status = 1;
    5552           0 :             *s = ch;
    5553             :         }
    5554           0 :         s++;
    5555             :     }
    5556             : 
    5557           0 :     return status;
    5558             : }
    5559             : 
    5560             : static
    5561           0 : int fixswapcase(PyUnicodeObject *self)
    5562             : {
    5563           0 :     Py_ssize_t len = self->length;
    5564           0 :     Py_UNICODE *s = self->str;
    5565           0 :     int status = 0;
    5566             : 
    5567           0 :     while (len-- > 0) {
    5568           0 :         if (Py_UNICODE_ISUPPER(*s)) {
    5569           0 :             *s = Py_UNICODE_TOLOWER(*s);
    5570           0 :             status = 1;
    5571           0 :         } else if (Py_UNICODE_ISLOWER(*s)) {
    5572           0 :             *s = Py_UNICODE_TOUPPER(*s);
    5573           0 :             status = 1;
    5574             :         }
    5575           0 :         s++;
    5576             :     }
    5577             : 
    5578           0 :     return status;
    5579             : }
    5580             : 
    5581             : static
    5582           0 : int fixcapitalize(PyUnicodeObject *self)
    5583             : {
    5584           0 :     Py_ssize_t len = self->length;
    5585           0 :     Py_UNICODE *s = self->str;
    5586           0 :     int status = 0;
    5587             : 
    5588           0 :     if (len == 0)
    5589           0 :         return 0;
    5590           0 :     if (!Py_UNICODE_ISUPPER(*s)) {
    5591           0 :         *s = Py_UNICODE_TOUPPER(*s);
    5592           0 :         status = 1;
    5593             :     }
    5594           0 :     s++;
    5595           0 :     while (--len > 0) {
    5596           0 :         if (!Py_UNICODE_ISLOWER(*s)) {
    5597           0 :             *s = Py_UNICODE_TOLOWER(*s);
    5598           0 :             status = 1;
    5599             :         }
    5600           0 :         s++;
    5601             :     }
    5602           0 :     return status;
    5603             : }
    5604             : 
    5605             : static
    5606           0 : int fixtitle(PyUnicodeObject *self)
    5607             : {
    5608           0 :     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    5609             :     register Py_UNICODE *e;
    5610             :     int previous_is_cased;
    5611             : 
    5612             :     /* Shortcut for single character strings */
    5613           0 :     if (PyUnicode_GET_SIZE(self) == 1) {
    5614           0 :         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
    5615           0 :         if (*p != ch) {
    5616           0 :             *p = ch;
    5617           0 :             return 1;
    5618             :         }
    5619             :         else
    5620           0 :             return 0;
    5621             :     }
    5622             : 
    5623           0 :     e = p + PyUnicode_GET_SIZE(self);
    5624           0 :     previous_is_cased = 0;
    5625           0 :     for (; p < e; p++) {
    5626           0 :         register const Py_UNICODE ch = *p;
    5627             : 
    5628           0 :         if (previous_is_cased)
    5629           0 :             *p = Py_UNICODE_TOLOWER(ch);
    5630             :         else
    5631           0 :             *p = Py_UNICODE_TOTITLE(ch);
    5632             : 
    5633           0 :         if (Py_UNICODE_ISLOWER(ch) ||
    5634           0 :             Py_UNICODE_ISUPPER(ch) ||
    5635           0 :             Py_UNICODE_ISTITLE(ch))
    5636           0 :             previous_is_cased = 1;
    5637             :         else
    5638           0 :             previous_is_cased = 0;
    5639             :     }
    5640           0 :     return 1;
    5641             : }
    5642             : 
    5643             : PyObject *
    5644           0 : PyUnicode_Join(PyObject *separator, PyObject *seq)
    5645             : {
    5646           0 :     PyObject *internal_separator = NULL;
    5647           0 :     const Py_UNICODE blank = ' ';
    5648           0 :     const Py_UNICODE *sep = &blank;
    5649           0 :     Py_ssize_t seplen = 1;
    5650           0 :     PyUnicodeObject *res = NULL; /* the result */
    5651           0 :     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
    5652             :     Py_ssize_t res_used;         /* # used bytes */
    5653             :     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
    5654             :     PyObject *fseq;          /* PySequence_Fast(seq) */
    5655             :     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    5656             :     PyObject *item;
    5657             :     Py_ssize_t i;
    5658             : 
    5659           0 :     fseq = PySequence_Fast(seq, "can only join an iterable");
    5660           0 :     if (fseq == NULL) {
    5661           0 :         return NULL;
    5662             :     }
    5663             : 
    5664             :     /* Grrrr.  A codec may be invoked to convert str objects to
    5665             :      * Unicode, and so it's possible to call back into Python code
    5666             :      * during PyUnicode_FromObject(), and so it's possible for a sick
    5667             :      * codec to change the size of fseq (if seq is a list).  Therefore
    5668             :      * we have to keep refetching the size -- can't assume seqlen
    5669             :      * is invariant.
    5670             :      */
    5671           0 :     seqlen = PySequence_Fast_GET_SIZE(fseq);
    5672             :     /* If empty sequence, return u"". */
    5673           0 :     if (seqlen == 0) {
    5674           0 :         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
    5675           0 :         goto Done;
    5676             :     }
    5677             :     /* If singleton sequence with an exact Unicode, return that. */
    5678           0 :     if (seqlen == 1) {
    5679           0 :         item = PySequence_Fast_GET_ITEM(fseq, 0);
    5680           0 :         if (PyUnicode_CheckExact(item)) {
    5681           0 :             Py_INCREF(item);
    5682           0 :             res = (PyUnicodeObject *)item;
    5683           0 :             goto Done;
    5684             :         }
    5685             :     }
    5686             : 
    5687             :     /* At least two items to join, or one that isn't exact Unicode. */
    5688           0 :     if (seqlen > 1) {
    5689             :         /* Set up sep and seplen -- they're needed. */
    5690           0 :         if (separator == NULL) {
    5691           0 :             sep = &blank;
    5692           0 :             seplen = 1;
    5693             :         }
    5694             :         else {
    5695           0 :             internal_separator = PyUnicode_FromObject(separator);
    5696           0 :             if (internal_separator == NULL)
    5697           0 :                 goto onError;
    5698           0 :             sep = PyUnicode_AS_UNICODE(internal_separator);
    5699           0 :             seplen = PyUnicode_GET_SIZE(internal_separator);
    5700             :             /* In case PyUnicode_FromObject() mutated seq. */
    5701           0 :             seqlen = PySequence_Fast_GET_SIZE(fseq);
    5702             :         }
    5703             :     }
    5704             : 
    5705             :     /* Get space. */
    5706           0 :     res = _PyUnicode_New(res_alloc);
    5707           0 :     if (res == NULL)
    5708           0 :         goto onError;
    5709           0 :     res_p = PyUnicode_AS_UNICODE(res);
    5710           0 :     res_used = 0;
    5711             : 
    5712           0 :     for (i = 0; i < seqlen; ++i) {
    5713             :         Py_ssize_t itemlen;
    5714             :         Py_ssize_t new_res_used;
    5715             : 
    5716           0 :         item = PySequence_Fast_GET_ITEM(fseq, i);
    5717             :         /* Convert item to Unicode. */
    5718           0 :         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
    5719           0 :             PyErr_Format(PyExc_TypeError,
    5720             :                          "sequence item %zd: expected string or Unicode,"
    5721             :                          " %.80s found",
    5722           0 :                          i, Py_TYPE(item)->tp_name);
    5723           0 :             goto onError;
    5724             :         }
    5725           0 :         item = PyUnicode_FromObject(item);
    5726           0 :         if (item == NULL)
    5727           0 :             goto onError;
    5728             :         /* We own a reference to item from here on. */
    5729             : 
    5730             :         /* In case PyUnicode_FromObject() mutated seq. */
    5731           0 :         seqlen = PySequence_Fast_GET_SIZE(fseq);
    5732             : 
    5733             :         /* Make sure we have enough space for the separator and the item. */
    5734           0 :         itemlen = PyUnicode_GET_SIZE(item);
    5735           0 :         new_res_used = res_used + itemlen;
    5736           0 :         if (new_res_used < 0)
    5737           0 :             goto Overflow;
    5738           0 :         if (i < seqlen - 1) {
    5739           0 :             new_res_used += seplen;
    5740           0 :             if (new_res_used < 0)
    5741           0 :                 goto Overflow;
    5742             :         }
    5743           0 :         if (new_res_used > res_alloc) {
    5744             :             /* double allocated size until it's big enough */
    5745             :             do {
    5746           0 :                 res_alloc += res_alloc;
    5747           0 :                 if (res_alloc <= 0)
    5748           0 :                     goto Overflow;
    5749           0 :             } while (new_res_used > res_alloc);
    5750           0 :             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
    5751           0 :                 Py_DECREF(item);
    5752           0 :                 goto onError;
    5753             :             }
    5754           0 :             res_p = PyUnicode_AS_UNICODE(res) + res_used;
    5755             :         }
    5756             : 
    5757             :         /* Copy item, and maybe the separator. */
    5758           0 :         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
    5759           0 :         res_p += itemlen;
    5760           0 :         if (i < seqlen - 1) {
    5761           0 :             Py_UNICODE_COPY(res_p, sep, seplen);
    5762           0 :             res_p += seplen;
    5763             :         }
    5764           0 :         Py_DECREF(item);
    5765           0 :         res_used = new_res_used;
    5766             :     }
    5767             : 
    5768             :     /* Shrink res to match the used area; this probably can't fail,
    5769             :      * but it's cheap to check.
    5770             :      */
    5771           0 :     if (_PyUnicode_Resize(&res, res_used) < 0)
    5772           0 :         goto onError;
    5773             : 
    5774             :   Done:
    5775           0 :     Py_XDECREF(internal_separator);
    5776           0 :     Py_DECREF(fseq);
    5777           0 :     return (PyObject *)res;
    5778             : 
    5779             :   Overflow:
    5780           0 :     PyErr_SetString(PyExc_OverflowError,
    5781             :                     "join() result is too long for a Python string");
    5782           0 :     Py_DECREF(item);
    5783             :     /* fall through */
    5784             : 
    5785             :   onError:
    5786           0 :     Py_XDECREF(internal_separator);
    5787           0 :     Py_DECREF(fseq);
    5788           0 :     Py_XDECREF(res);
    5789           0 :     return NULL;
    5790             : }
    5791             : 
    5792             : static
    5793           0 : PyUnicodeObject *pad(PyUnicodeObject *self,
    5794             :                      Py_ssize_t left,
    5795             :                      Py_ssize_t right,
    5796             :                      Py_UNICODE fill)
    5797             : {
    5798             :     PyUnicodeObject *u;
    5799             : 
    5800           0 :     if (left < 0)
    5801           0 :         left = 0;
    5802           0 :     if (right < 0)
    5803           0 :         right = 0;
    5804             : 
    5805           0 :     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
    5806           0 :         Py_INCREF(self);
    5807           0 :         return self;
    5808             :     }
    5809             : 
    5810           0 :     if (left > PY_SSIZE_T_MAX - self->length ||
    5811           0 :         right > PY_SSIZE_T_MAX - (left + self->length)) {
    5812           0 :         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
    5813           0 :         return NULL;
    5814             :     }
    5815           0 :     u = _PyUnicode_New(left + self->length + right);
    5816           0 :     if (u) {
    5817           0 :         if (left)
    5818           0 :             Py_UNICODE_FILL(u->str, fill, left);
    5819           0 :         Py_UNICODE_COPY(u->str + left, self->str, self->length);
    5820           0 :         if (right)
    5821           0 :             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
    5822             :     }
    5823             : 
    5824           0 :     return u;
    5825             : }
    5826             : 
    5827           0 : PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
    5828             : {
    5829             :     PyObject *list;
    5830             : 
    5831           0 :     string = PyUnicode_FromObject(string);
    5832           0 :     if (string == NULL)
    5833           0 :         return NULL;
    5834             : 
    5835           0 :     list = stringlib_splitlines(
    5836           0 :         (PyObject*) string, PyUnicode_AS_UNICODE(string),
    5837             :         PyUnicode_GET_SIZE(string), keepends);
    5838             : 
    5839           0 :     Py_DECREF(string);
    5840           0 :     return list;
    5841             : }
    5842             : 
    5843             : static
    5844           0 : PyObject *split(PyUnicodeObject *self,
    5845             :                 PyUnicodeObject *substring,
    5846             :                 Py_ssize_t maxcount)
    5847             : {
    5848           0 :     if (maxcount < 0)
    5849           0 :         maxcount = PY_SSIZE_T_MAX;
    5850             : 
    5851           0 :     if (substring == NULL)
    5852           0 :         return stringlib_split_whitespace(
    5853           0 :             (PyObject*) self,  self->str, self->length, maxcount
    5854             :             );
    5855             : 
    5856           0 :     return stringlib_split(
    5857           0 :         (PyObject*) self,  self->str, self->length,
    5858           0 :         substring->str, substring->length,
    5859             :         maxcount
    5860             :         );
    5861             : }
    5862             : 
    5863             : static
    5864           0 : PyObject *rsplit(PyUnicodeObject *self,
    5865             :                  PyUnicodeObject *substring,
    5866             :                  Py_ssize_t maxcount)
    5867             : {
    5868           0 :     if (maxcount < 0)
    5869           0 :         maxcount = PY_SSIZE_T_MAX;
    5870             : 
    5871           0 :     if (substring == NULL)
    5872           0 :         return stringlib_rsplit_whitespace(
    5873           0 :             (PyObject*) self,  self->str, self->length, maxcount
    5874             :             );
    5875             : 
    5876           0 :     return stringlib_rsplit(
    5877           0 :         (PyObject*) self,  self->str, self->length,
    5878           0 :         substring->str, substring->length,
    5879             :         maxcount
    5880             :         );
    5881             : }
    5882             : 
                       /* Build the result of replacing occurrences of str1 in self by str2.
                          At most maxcount replacements are made; a negative maxcount is
                          turned into PY_SSIZE_T_MAX, i.e. effectively no limit.
                          Returns a new reference.  When nothing is replaced this is self
                          itself if it is an exact unicode object, otherwise a fresh copy of
                          its buffer.  Returns NULL if the result object cannot be created,
                          or with OverflowError set if the result size does not fit. */
    5883             : static
    5884           0 : PyObject *replace(PyUnicodeObject *self,
    5885             :                   PyUnicodeObject *str1,
    5886             :                   PyUnicodeObject *str2,
    5887             :                   Py_ssize_t maxcount)
    5888             : {
    5889             :     PyUnicodeObject *u;
    5890             : 
                           /* Note: the empty-self test below is only reached when maxcount is
                              non-negative; with a negative maxcount an empty self still goes
                              through the normal code paths. */
    5891           0 :     if (maxcount < 0)
    5892           0 :         maxcount = PY_SSIZE_T_MAX;
    5893           0 :     else if (maxcount == 0 || self->length == 0)
    5894             :         goto nothing;
    5895             : 
    5896           0 :     if (str1->length == str2->length) {
    5897             :         Py_ssize_t i;
    5898             :         /* same length */
                               /* The result has the same size as self, so a copy of self is
                                  made and patched in place. */
    5899           0 :         if (str1->length == 0)
    5900           0 :             goto nothing;
    5901           0 :         if (str1->length == 1) {
    5902             :             /* replace characters */
    5903             :             Py_UNICODE u1, u2;
                                   /* avoid the copy when findchar() reports no occurrence */
    5904           0 :             if (!findchar(self->str, self->length, str1->str[0]))
    5905           0 :                 goto nothing;
    5906           0 :             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
    5907           0 :             if (!u)
    5908           0 :                 return NULL;
    5909           0 :             Py_UNICODE_COPY(u->str, self->str, self->length);
    5910           0 :             u1 = str1->str[0];
    5911           0 :             u2 = str2->str[0];
    5912           0 :             for (i = 0; i < u->length; i++)
    5913           0 :                 if (u->str[i] == u1) {
                                           /* stop once maxcount characters have been replaced */
    5914           0 :                     if (--maxcount < 0)
    5915           0 :                         break;
    5916           0 :                     u->str[i] = u2;
    5917             :                 }
    5918             :         } else {
    5919           0 :             i = stringlib_find(
    5920           0 :                 self->str, self->length, str1->str, str1->length, 0
    5921             :                 );
    5922           0 :             if (i < 0)
    5923           0 :                 goto nothing;
    5924           0 :             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
    5925           0 :             if (!u)
    5926           0 :                 return NULL;
    5927           0 :             Py_UNICODE_COPY(u->str, self->str, self->length);
    5928             : 
    5929             :             /* change everything in-place, starting with this one */
    5930           0 :             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
    5931           0 :             i += str1->length;
    5932             : 
                                   /* The first match is already done, hence the pre-decrement
                                      before each further search.  The value returned by
                                      stringlib_find() is used directly as an index into u->str,
                                      so it presumably adds its last argument (the offset of the
                                      searched tail) to the hit position -- confirm in stringlib. */
    5933           0 :             while ( --maxcount > 0) {
    5934           0 :                 i = stringlib_find(self->str+i, self->length-i,
    5935           0 :                                    str1->str, str1->length,
    5936             :                                    i);
    5937           0 :                 if (i == -1)
    5938           0 :                     break;
    5939           0 :                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
    5940           0 :                 i += str1->length;
    5941             :             }
    5942             :         }
    5943             :     } else {
    5944             : 
    5945             :         Py_ssize_t n, i, j;
    5946             :         Py_ssize_t product, new_size, delta;
    5947             :         Py_UNICODE *p;
    5948             : 
    5949             :         /* replace strings */
                               /* n is the number of replacements to perform; maxcount is handed
                                  to stringlib_count(), presumably as an upper bound -- confirm. */
    5950           0 :         n = stringlib_count(self->str, self->length, str1->str, str1->length,
    5951             :                             maxcount);
    5952           0 :         if (n == 0)
    5953           0 :             goto nothing;
    5954             :         /* new_size = self->length + n * (str2->length - str1->length)); */
                               /* delta cannot be 0 here, because equal lengths are handled by
                                  the branch above; the delta == 0 test is purely defensive. */
    5955           0 :         delta = (str2->length - str1->length);
    5956           0 :         if (delta == 0) {
    5957           0 :             new_size = self->length;
    5958             :         } else {
                                   /* overflow of the multiplication is detected by dividing the
                                      product back and comparing with n */
    5959           0 :             product = n * (str2->length - str1->length);
    5960           0 :             if ((product / (str2->length - str1->length)) != n) {
    5961           0 :                 PyErr_SetString(PyExc_OverflowError,
    5962             :                                 "replace string is too long");
    5963           0 :                 return NULL;
    5964             :             }
                                   /* a negative sum is treated as overflow of the addition */
    5965           0 :             new_size = self->length + product;
    5966           0 :             if (new_size < 0) {
    5967           0 :                 PyErr_SetString(PyExc_OverflowError,
    5968             :                                 "replace string is too long");
    5969           0 :                 return NULL;
    5970             :             }
    5971             :         }
    5972           0 :         u = _PyUnicode_New(new_size);
    5973           0 :         if (!u)
    5974           0 :             return NULL;
                               /* i is the read position in self, p the write position in u */
    5975           0 :         i = 0;
    5976           0 :         p = u->str;
    5977           0 :         if (str1->length > 0) {
    5978           0 :             while (n-- > 0) {
    5979             :                 /* look for next match */
    5980           0 :                 j = stringlib_find(self->str+i, self->length-i,
    5981           0 :                                    str1->str, str1->length,
    5982             :                                    i);
    5983           0 :                 if (j == -1)
    5984           0 :                     break;
    5985           0 :                 else if (j > i) {
    5986             :                     /* copy unchanged part [i:j] */
    5987           0 :                     Py_UNICODE_COPY(p, self->str+i, j-i);
    5988           0 :                     p += j - i;
    5989             :                 }
    5990             :                 /* copy substitution string */
    5991           0 :                 if (str2->length > 0) {
    5992           0 :                     Py_UNICODE_COPY(p, str2->str, str2->length);
    5993           0 :                     p += str2->length;
    5994             :                 }
                                       /* resume the search right after the replaced match */
    5995           0 :                 i = j + str1->length;
    5996             :             }
    5997           0 :             if (i < self->length)
    5998             :                 /* copy tail [i:] */
    5999           0 :                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
    6000             :         } else {
    6001             :             /* interleave */
                                   /* str1 is empty: str2 is written n times, with one character
                                      of self between consecutive copies (n-1 characters in
                                      total); the rest of self is appended after the loop. */
    6002           0 :             while (n > 0) {
    6003           0 :                 Py_UNICODE_COPY(p, str2->str, str2->length);
    6004           0 :                 p += str2->length;
    6005           0 :                 if (--n <= 0)
    6006           0 :                     break;
    6007           0 :                 *p++ = self->str[i++];
    6008             :             }
    6009           0 :             Py_UNICODE_COPY(p, self->str+i, self->length-i);
    6010             :         }
    6011             :     }
    6012           0 :     return (PyObject *) u;
    6013             : 
    6014             :   nothing:
    6015             :     /* nothing to replace; return original string (when possible) */
                           /* For a subclass instance a new object is built from the buffer
                              instead of handing back self. */
    6016           0 :     if (PyUnicode_CheckExact(self)) {
    6017           0 :         Py_INCREF(self);
    6018           0 :         return (PyObject *) self;
    6019             :     }
    6020           0 :     return PyUnicode_FromUnicode(self->str, self->length);
    6021             : }
    6022             : 
    6023             : /* --- Unicode Object Methods --------------------------------------------- */
    6024             : 
    6025             : PyDoc_STRVAR(title__doc__,
    6026             :              "S.title() -> unicode\n\
    6027             : \n\
    6028             : Return a titlecased version of S, i.e. words start with title case\n\
    6029             : characters, all remaining cased characters have lower case.");
    6030             : 
                       /* Implementation of S.title(): hands self to fixup() together with
                          the fixtitle callback and returns whatever fixup() returns. */
    6031             : static PyObject*
    6032           0 : unicode_title(PyUnicodeObject *self)
    6033             : {
    6034           0 :     return fixup(self, fixtitle);
    6035             : }
    6036             : 
    6037             : PyDoc_STRVAR(capitalize__doc__,
    6038             :              "S.capitalize() -> unicode\n\
    6039             : \n\
    6040             : Return a capitalized version of S, i.e. make the first character\n\
    6041             : have upper case and the rest lower case.");
    6042             : 
                       /* Implementation of S.capitalize(): hands self to fixup() together
                          with the fixcapitalize callback and returns the result of fixup(). */
    6043             : static PyObject*
    6044           0 : unicode_capitalize(PyUnicodeObject *self)
    6045             : {
    6046           0 :     return fixup(self, fixcapitalize);
    6047             : }
    6048             : 
    6049             : #if 0
    6050             : PyDoc_STRVAR(capwords__doc__,
    6051             :              "S.capwords() -> unicode\n\
    6052             : \n\
    6053             : Apply .capitalize() to all words in S and return the result with\n\
    6054             : normalized whitespace (all whitespace strings are replaced by ' ').");
    6055             : 
                       /* Implementation of S.capwords().  This definition sits inside an
                          #if 0 region and is therefore not compiled.
                          Returns the joined string, or NULL if splitting, capitalizing a
                          word or joining fails. */
    6056             : static PyObject*
    6057             : unicode_capwords(PyUnicodeObject *self)
    6058             : {
    6059             :     PyObject *list;
    6060             :     PyObject *item;
    6061             :     Py_ssize_t i;
    6062             : 
    6063             :     /* Split into words */
                           /* NULL separator and -1 count presumably mean: split on whitespace
                              without limit -- confirm against split(). */
    6064             :     list = split(self, NULL, -1);
    6065             :     if (!list)
    6066             :         return NULL;
    6067             : 
    6068             :     /* Capitalize each word */
    6069             :     for (i = 0; i < PyList_GET_SIZE(list); i++) {
    6070             :         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
    6071             :                      fixcapitalize);
                               /* item is NULL here, so the shared exit below returns NULL */
    6072             :         if (item == NULL)
    6073             :             goto onError;
                               /* release the old word, then store the capitalized one in its slot */
    6074             :         Py_DECREF(PyList_GET_ITEM(list, i));
    6075             :         PyList_SET_ITEM(list, i, item);
    6076             :     }
    6077             : 
    6078             :     /* Join the words to form a new string */
    6079             :     item = PyUnicode_Join(NULL, list);
    6080             : 
                         /* Shared exit for success and failure: the list is always released
                            and item (possibly NULL) is the return value. */
    6081             :   onError:
    6082             :     Py_DECREF(list);
    6083             :     return (PyObject *)item;
    6084             : }
    6085             : #endif
    6086             : 
    6087             : /* Argument converter.  Coerces to a single unicode character */
                       /* obj is coerced with PyUnicode_FromObject(); addr is treated as a
                          pointer to the Py_UNICODE that receives the character.
                          Returns 1 on success and 0 with TypeError set when obj cannot be
                          coerced or the coerced string is not exactly one character long.
                          Used below as the converter of an O& format unit. */
    6088             : 
    6089             : static int
    6090           0 : convert_uc(PyObject *obj, void *addr)
    6091             : {
    6092           0 :     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
    6093             :     PyObject *uniobj;
    6094             :     Py_UNICODE *unistr;
    6095             : 
    6096           0 :     uniobj = PyUnicode_FromObject(obj);
    6097           0 :     if (uniobj == NULL) {
                               /* any error raised by the coercion is replaced by this TypeError */
    6098           0 :         PyErr_SetString(PyExc_TypeError,
    6099             :                         "The fill character cannot be converted to Unicode");
    6100           0 :         return 0;
    6101             :     }
    6102           0 :     if (PyUnicode_GET_SIZE(uniobj) != 1) {
    6103           0 :         PyErr_SetString(PyExc_TypeError,
    6104             :                         "The fill character must be exactly one character long");
    6105           0 :         Py_DECREF(uniobj);
    6106           0 :         return 0;
    6107             :     }
                           /* copy the character out before the temporary object is released */
    6108           0 :     unistr = PyUnicode_AS_UNICODE(uniobj);
    6109           0 :     *fillcharloc = unistr[0];
    6110           0 :     Py_DECREF(uniobj);
    6111           0 :     return 1;
    6112             : }
    6113             : 
    6114             : PyDoc_STRVAR(center__doc__,
    6115             :              "S.center(width[, fillchar]) -> unicode\n\
    6116             : \n\
    6117             : Return S centered in a Unicode string of length width. Padding is\n\
    6118             : done using the specified fill character (default is a space)");
    6119             : 
                       /* Implementation of S.center(width[, fillchar]).
                          args holds the width and an optional fill character, which is
                          converted through convert_uc() and defaults to a space.
                          Returns NULL if argument parsing fails, self itself when it is an
                          exact unicode object that is already at least width long, and the
                          result of pad() otherwise. */
    6120             : static PyObject *
    6121           0 : unicode_center(PyUnicodeObject *self, PyObject *args)
    6122             : {
    6123             :     Py_ssize_t marg, left;
    6124             :     Py_ssize_t width;
    6125           0 :     Py_UNICODE fillchar = ' ';
    6126             : 
    6127           0 :     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
    6128           0 :         return NULL;
    6129             : 
    6130           0 :     if (self->length >= width && PyUnicode_CheckExact(self)) {
    6131           0 :         Py_INCREF(self);
    6132           0 :         return (PyObject*) self;
    6133             :     }
    6134             : 
                           /* marg is the total padding; it is zero or negative when a subclass
                              instance is already wide enough, in which case pad() presumably
                              just copies self -- TODO confirm against pad().
                              The left side gets half of the padding, plus one extra character
                              when both marg and width are odd. */
    6135           0 :     marg = width - self->length;
    6136           0 :     left = marg / 2 + (marg & width & 1);
    6137             : 
    6138           0 :     return (PyObject*) pad(self, left, marg - left, fillchar);
    6139             : }
    6140             : 
    6141             : #if 0
    6142             : 
    6143             : /* This code should go into some future Unicode collation support
    6144             :    module. The basic comparison should compare ordinals on a naive
    6145             :    basis (this is what Java does and thus Jython too). */
                       /* Everything up to the #else below is inside an #if 0 region and is
                          therefore not compiled. */
    6146             : 
    6147             : /* speedy UTF-16 code point order comparison */
    6148             : /* gleaned from: */
    6149             : /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
    6150             : 
                       /* Adjustment table indexed by the top five bits (c >> 11) of a 16-bit
                          code unit.  Index 27 (0xD800-0xDFFF, the surrogates) is shifted up by
                          0x2000 and indexes 28-31 (0xE000-0xFFFF) are shifted down by 0x800,
                          so that surrogates sort after all other units. */
    6151             : static short utf16Fixup[32] =
    6152             : {
    6153             :     0, 0, 0, 0, 0, 0, 0, 0,
    6154             :     0, 0, 0, 0, 0, 0, 0, 0,
    6155             :     0, 0, 0, 0, 0, 0, 0, 0,
    6156             :     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
    6157             : };
    6158             : 
                       /* Compare str1 and str2 unit by unit after applying utf16Fixup.
                          Returns -1, 0 or 1.
                          NOTE(review): the table lookup assumes c >> 11 is at most 31, i.e. a
                          16-bit Py_UNICODE -- confirm before ever enabling this code. */
    6159             : static int
    6160             : unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
    6161             : {
    6162             :     Py_ssize_t len1, len2;
    6163             : 
    6164             :     Py_UNICODE *s1 = str1->str;
    6165             :     Py_UNICODE *s2 = str2->str;
    6166             : 
    6167             :     len1 = str1->length;
    6168             :     len2 = str2->length;
    6169             : 
    6170             :     while (len1 > 0 && len2 > 0) {
    6171             :         Py_UNICODE c1, c2;
    6172             : 
    6173             :         c1 = *s1++;
    6174             :         c2 = *s2++;
    6175             : 
                               /* (1<<11) * 26 is 0xD000; smaller values never need a fixup */
    6176             :         if (c1 > (1<<11) * 26)
    6177             :             c1 += utf16Fixup[c1>>11];
    6178             :         if (c2 > (1<<11) * 26)
    6179             :             c2 += utf16Fixup[c2>>11];
    6180             :         /* now c1 and c2 are in UTF-32-compatible order */
    6181             : 
    6182             :         if (c1 != c2)
    6183             :             return (c1 < c2) ? -1 : 1;
    6184             : 
    6185             :         len1--; len2--;
    6186             :     }
    6187             : 
                           /* one string is a prefix of the other: the shorter one sorts first,
                              equal remaining lengths yield 0 */
    6188             :     return (len1 < len2) ? -1 : (len1 != len2);
    6189             : }
    6190             : 
    6191             : #else
    6192             : 
                       /* Compare str1 and str2 element by element on their raw Py_UNICODE
                          values.  Returns -1 if str1 sorts before str2, 1 if it sorts after,
                          and 0 if both have the same contents.  This is the variant that is
                          compiled (the #else side of the #if 0 above). */
    6193             : static int
    6194           0 : unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
    6195             : {
    6196             :     register Py_ssize_t len1, len2;
    6197             : 
    6198           0 :     Py_UNICODE *s1 = str1->str;
    6199           0 :     Py_UNICODE *s2 = str2->str;
    6200             : 
    6201           0 :     len1 = str1->length;
    6202           0 :     len2 = str2->length;
    6203             : 
                           /* walk the common prefix; the first differing element decides */
    6204           0 :     while (len1 > 0 && len2 > 0) {
    6205             :         Py_UNICODE c1, c2;
    6206             : 
    6207           0 :         c1 = *s1++;
    6208           0 :         c2 = *s2++;
    6209             : 
    6210           0 :         if (c1 != c2)
    6211           0 :             return (c1 < c2) ? -1 : 1;
    6212             : 
    6213           0 :         len1--; len2--;
    6214             :     }
    6215             : 
                           /* one string is a prefix of the other: the shorter one sorts first,
                              equal remaining lengths yield 0 */
    6216           0 :     return (len1 < len2) ? -1 : (len1 != len2);
    6217             : }
    6218             : 
    6219             : #endif
    6220             : 
                       /* Three-way comparison of two objects after coercing both with
                          PyUnicode_FromObject().  Returns the result of unicode_compare()
                          (-1, 0 or 1), or -1 if either coercion fails.  Because -1 is also a
                          regular result, a failure can only be told apart by checking for a
                          pending exception (presumably set by PyUnicode_FromObject()). */
    6221           0 : int PyUnicode_Compare(PyObject *left,
    6222             :                       PyObject *right)
    6223             : {
    6224           0 :     PyUnicodeObject *u = NULL, *v = NULL;
    6225             :     int result;
    6226             : 
    6227             :     /* Coerce the two arguments */
    6228           0 :     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
    6229           0 :     if (u == NULL)
    6230           0 :         goto onError;
    6231           0 :     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
    6232           0 :     if (v == NULL)
    6233           0 :         goto onError;
    6234             : 
    6235             :     /* Shortcut for empty or interned objects */
                           /* identical objects compare equal without looking at the contents */
    6236           0 :     if (v == u) {
    6237           0 :         Py_DECREF(u);
    6238           0 :         Py_DECREF(v);
    6239           0 :         return 0;
    6240             :     }
    6241             : 
    6242           0 :     result = unicode_compare(u, v);
    6243             : 
    6244           0 :     Py_DECREF(u);
    6245           0 :     Py_DECREF(v);
    6246           0 :     return result;
    6247             : 
                         /* u and/or v may still be NULL here, hence Py_XDECREF */
    6248             :   onError:
    6249           0 :     Py_XDECREF(u);
    6250           0 :     Py_XDECREF(v);
    6251           0 :     return -1;
    6252             : }
    6253             : 
                       /* Rich comparison built on top of PyUnicode_Compare().
                          Returns a new reference to a bool for the requested op,
                          Py_NotImplemented when the comparison failed with a TypeError, or
                          NULL with the exception left in place for other failures.  For
                          Py_EQ and Py_NE a UnicodeDecodeError is turned into a
                          UnicodeWarning and the operands are reported as unequal. */
    6254           0 : PyObject *PyUnicode_RichCompare(PyObject *left,
    6255             :                                 PyObject *right,
    6256             :                                 int op)
    6257             : {
    6258             :     int result;
    6259             : 
                           /* -1 is also a regular result, so an error is only assumed when an
                              exception is actually pending */
    6260           0 :     result = PyUnicode_Compare(left, right);
    6261           0 :     if (result == -1 && PyErr_Occurred())
    6262           0 :         goto onError;
    6263             : 
    6264             :     /* Convert the return value to a Boolean */
                           /* There is no default case: for any other op the raw comparison
                              result is handed to PyBool_FromLong() unchanged. */
    6265           0 :     switch (op) {
    6266             :     case Py_EQ:
    6267           0 :         result = (result == 0);
    6268           0 :         break;
    6269             :     case Py_NE:
    6270           0 :         result = (result != 0);
    6271           0 :         break;
    6272             :     case Py_LE:
    6273           0 :         result = (result <= 0);
    6274           0 :         break;
    6275             :     case Py_GE:
    6276           0 :         result = (result >= 0);
    6277           0 :         break;
    6278             :     case Py_LT:
    6279           0 :         result = (result == -1);
    6280           0 :         break;
    6281             :     case Py_GT:
    6282           0 :         result = (result == 1);
    6283           0 :         break;
    6284             :     }
    6285           0 :     return PyBool_FromLong(result);
    6286             : 
    6287             :   onError:
    6288             : 
    6289             :     /* Standard case
    6290             : 
    6291             :        Type errors mean that PyUnicode_FromObject() could not convert
    6292             :        one of the arguments (usually the right hand side) to Unicode,
    6293             :        ie. we can't handle the comparison request. However, it is
    6294             :        possible that the other object knows a comparison method, which
    6295             :        is why we return Py_NotImplemented to give the other object a
    6296             :        chance.
    6297             : 
    6298             :     */
    6299           0 :     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
    6300           0 :         PyErr_Clear();
    6301           0 :         Py_INCREF(Py_NotImplemented);
    6302           0 :         return Py_NotImplemented;
    6303             :     }
                           /* ordering comparisons propagate every other error unchanged */
    6304           0 :     if (op != Py_EQ && op != Py_NE)
    6305           0 :         return NULL;
    6306             : 
    6307             :     /* Equality comparison.
    6308             : 
    6309             :        This is a special case: we silence any PyExc_UnicodeDecodeError
    6310             :        and instead turn it into a PyErr_UnicodeWarning.
    6311             : 
    6312             :     */
    6313           0 :     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
    6314           0 :         return NULL;
    6315           0 :     PyErr_Clear();
                           /* a negative return from PyErr_Warn() is passed on as an error */
    6316           0 :     if (PyErr_Warn(PyExc_UnicodeWarning,
    6317             :                    (op == Py_EQ) ?
    6318             :                    "Unicode equal comparison "
    6319             :                    "failed to convert both arguments to Unicode - "
    6320             :                    "interpreting them as being unequal" :
    6321             :                    "Unicode unequal comparison "
    6322             :                    "failed to convert both arguments to Unicode - "
    6323             :                    "interpreting them as being unequal"
    6324             :             ) < 0)
    6325           0 :         return NULL;
                           /* unequal operands: Py_EQ yields False and Py_NE yields True */
    6326           0 :     result = (op == Py_NE);
    6327           0 :     return PyBool_FromLong(result);
    6328             : }
    6329             : 
                       /* Containment test after coercing both arguments with
                          PyUnicode_FromObject().  Returns the value produced by
                          stringlib_contains_obj(), or -1 if either coercion fails. */
    6330           0 : int PyUnicode_Contains(PyObject *container,
    6331             :                        PyObject *element)
    6332             : {
    6333             :     PyObject *str, *sub;
    6334             :     int result;
    6335             : 
    6336             :     /* Coerce the two arguments */
                           /* the element is coerced first, so its failure is reported first */
    6337           0 :     sub = PyUnicode_FromObject(element);
    6338           0 :     if (!sub) {
    6339           0 :         return -1;
    6340             :     }
    6341             : 
    6342           0 :     str = PyUnicode_FromObject(container);
    6343           0 :     if (!str) {
    6344           0 :         Py_DECREF(sub);
    6345           0 :         return -1;
    6346             :     }
    6347             : 
    6348           0 :     result = stringlib_contains_obj(str, sub);
    6349             : 
    6350           0 :     Py_DECREF(str);
    6351           0 :     Py_DECREF(sub);
    6352             : 
    6353           0 :     return result;
    6354             : }
    6355             : 
    6356             : /* Concat to string or Unicode object giving a new Unicode object. */
                       /* Both operands are coerced with PyUnicode_FromObject().  Returns a
                          new reference, or NULL if a coercion or the allocation fails, or
                          with OverflowError set if the combined length would exceed
                          PY_SSIZE_T_MAX. */
    6357             : 
    6358           0 : PyObject *PyUnicode_Concat(PyObject *left,
    6359             :                            PyObject *right)
    6360             : {
    6361           0 :     PyUnicodeObject *u = NULL, *v = NULL, *w;
    6362             : 
    6363             :     /* Coerce the two arguments */
    6364           0 :     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
    6365           0 :     if (u == NULL)
    6366           0 :         goto onError;
    6367           0 :     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
    6368           0 :     if (v == NULL)
    6369           0 :         goto onError;
    6370             : 
    6371             :     /* Shortcuts */
                           /* When one operand is the shared empty string, the other coerced
                              operand is returned as is; the reference obtained from the
                              coercion is handed over to the caller. */
    6372           0 :     if (v == unicode_empty) {
    6373           0 :         Py_DECREF(v);
    6374           0 :         return (PyObject *)u;
    6375             :     }
    6376           0 :     if (u == unicode_empty) {
    6377           0 :         Py_DECREF(u);
    6378           0 :         return (PyObject *)v;
    6379             :     }
    6380             : 
                           /* checked by subtraction so that the test itself cannot overflow */
    6381           0 :     if (u->length > PY_SSIZE_T_MAX - v->length) {
    6382           0 :         PyErr_SetString(PyExc_OverflowError,
    6383             :                         "strings are too large to concat");
    6384           0 :         goto onError;
    6385             :     }
    6386             : 
    6387             :     /* Concat the two Unicode strings */
    6388           0 :     w = _PyUnicode_New(u->length + v->length);
    6389           0 :     if (w == NULL)
    6390           0 :         goto onError;
    6391           0 :     Py_UNICODE_COPY(w->str, u->str, u->length);
    6392           0 :     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
    6393             : 
    6394           0 :     Py_DECREF(u);
    6395           0 :     Py_DECREF(v);
    6396           0 :     return (PyObject *)w;
    6397             : 
                         /* u and/or v may still be NULL here, hence Py_XDECREF */
    6398             :   onError:
    6399           0 :     Py_XDECREF(u);
    6400           0 :     Py_XDECREF(v);
    6401           0 :     return NULL;
    6402             : }
    6403             : 
    6404             : PyDoc_STRVAR(count__doc__,
    6405             :              "S.count(sub[, start[, end]]) -> int\n\
    6406             : \n\
    6407             : Return the number of non-overlapping occurrences of substring sub in\n\
    6408             : Unicode string S[start:end].  Optional arguments start and end are\n\
    6409             : interpreted as in slice notation.");
    6410             : 
                       /* Implementation of S.count(sub[, start[, end]]).
                          Returns NULL if the arguments cannot be parsed, otherwise the value
                          of PyInt_FromSsize_t() for the count (which may itself be NULL). */
    6411             : static PyObject *
    6412           0 : unicode_count(PyUnicodeObject *self, PyObject *args)
    6413             : {
    6414             :     PyUnicodeObject *substring;
    6415           0 :     Py_ssize_t start = 0;
    6416           0 :     Py_ssize_t end = PY_SSIZE_T_MAX;
    6417             :     PyObject *result;
    6418             : 
    6419           0 :     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
    6420             :                                             &start, &end))
    6421           0 :         return NULL;
    6422             : 
                           /* ADJUST_INDICES is defined elsewhere; it presumably resolves
                              negative indices and clamps start/end to the string -- confirm.
                              PY_SSIZE_T_MAX is passed as the last argument of
                              stringlib_count(), i.e. no practical cap on the count. */
    6423           0 :     ADJUST_INDICES(start, end, self->length);
    6424           0 :     result = PyInt_FromSsize_t(
    6425           0 :         stringlib_count(self->str + start, end - start,
    6426           0 :                         substring->str, substring->length,
    6427             :                         PY_SSIZE_T_MAX)
    6428             :         );
    6429             : 
                           /* the parsed substring is a reference owned by this function */
    6430           0 :     Py_DECREF(substring);
    6431             : 
    6432           0 :     return result;
    6433             : }
    6434             : 
    6435             : PyDoc_STRVAR(encode__doc__,
    6436             :              "S.encode([encoding[,errors]]) -> string or unicode\n\
    6437             : \n\
    6438             : Encodes S using the codec registered for encoding. encoding defaults\n\
    6439             : to the default encoding. errors may be given to set a different error\n\
    6440             : handling scheme. Default is 'strict' meaning that encoding errors raise\n\
    6441             : a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
    6442             : 'xmlcharrefreplace' as well as any other name registered with\n\
    6443             : codecs.register_error that can handle UnicodeEncodeErrors.");
    6444             : 
                       /* Implementation of S.encode([encoding[, errors]]).
                          Both arguments are optional C strings and stay NULL when omitted.
                          Returns the object produced by PyUnicode_AsEncodedObject(), NULL if
                          parsing or encoding fails, or NULL with TypeError set if the codec
                          returned something that is neither a str nor a unicode object. */
    6445             : static PyObject *
    6446           0 : unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
    6447             : {
    6448             :     static char *kwlist[] = {"encoding", "errors", 0};
    6449           0 :     char *encoding = NULL;
    6450           0 :     char *errors = NULL;
    6451             :     PyObject *v;
    6452             : 
    6453           0 :     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
    6454             :                                      kwlist, &encoding, &errors))
    6455           0 :         return NULL;
    6456           0 :     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
    6457           0 :     if (v == NULL)
    6458           0 :         goto onError;
                           /* reject codec results of any other type and release them */
    6459           0 :     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
    6460           0 :         PyErr_Format(PyExc_TypeError,
    6461             :                      "encoder did not return a string/unicode object "
    6462             :                      "(type=%.400s)",
    6463           0 :                      Py_TYPE(v)->tp_name);
    6464           0 :         Py_DECREF(v);
    6465           0 :         return NULL;
    6466             :     }
    6467           0 :     return v;
    6468             : 
                         /* nothing to clean up on this path; it only returns NULL */
    6469             :   onError:
    6470           0 :     return NULL;
    6471             : }
    6472             : 
    6473             : PyDoc_STRVAR(decode__doc__,
    6474             :              "S.decode([encoding[,errors]]) -> string or unicode\n\
    6475             : \n\
    6476             : Decodes S using the codec registered for encoding. encoding defaults\n\
    6477             : to the default encoding. errors may be given to set a different error\n\
    6478             : handling scheme. Default is 'strict' meaning that encoding errors raise\n\
    6479             : a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
    6480             : as well as any other name registered with codecs.register_error that is\n\
    6481             : able to handle UnicodeDecodeErrors.");
    6482             : 
                       /* Implementation of S.decode([encoding[, errors]]).
                          Both arguments are optional C strings and stay NULL when omitted.
                          Returns the object produced by PyUnicode_AsDecodedObject(), NULL if
                          parsing or decoding fails, or NULL with TypeError set if the codec
                          returned something that is neither a str nor a unicode object. */
    6483             : static PyObject *
    6484           0 : unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
    6485             : {
    6486             :     static char *kwlist[] = {"encoding", "errors", 0};
    6487           0 :     char *encoding = NULL;
    6488           0 :     char *errors = NULL;
    6489             :     PyObject *v;
    6490             : 
    6491           0 :     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
    6492             :                                      kwlist, &encoding, &errors))
    6493           0 :         return NULL;
    6494           0 :     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
    6495           0 :     if (v == NULL)
    6496           0 :         goto onError;
                           /* reject codec results of any other type and release them */
    6497           0 :     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
    6498           0 :         PyErr_Format(PyExc_TypeError,
    6499             :                      "decoder did not return a string/unicode object "
    6500             :                      "(type=%.400s)",
    6501           0 :                      Py_TYPE(v)->tp_name);
    6502           0 :         Py_DECREF(v);
    6503           0 :         return NULL;
    6504             :     }
    6505           0 :     return v;
    6506             : 
                         /* nothing to clean up on this path; it only returns NULL */
    6507             :   onError:
    6508           0 :     return NULL;
    6509             : }
    6510             : 
    6511             : PyDoc_STRVAR(expandtabs__doc__,
    6512             :              "S.expandtabs([tabsize]) -> unicode\n\
    6513             : \n\
    6514             : Return a copy of S where all tab characters are expanded using spaces.\n\
    6515             : If tabsize is not given, a tab size of 8 characters is assumed.");
    6516             : 
    6517             : static PyObject*
    6518           0 : unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
    6519             : {
    6520             :     Py_UNICODE *e;
    6521             :     Py_UNICODE *p;
    6522             :     Py_UNICODE *q;
    6523             :     Py_UNICODE *qe;
    6524             :     Py_ssize_t i, j, incr;
    6525             :     PyUnicodeObject *u;
    6526           0 :     int tabsize = 8;
    6527             : 
    6528           0 :     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
    6529           0 :         return NULL;
    6530             : 
    6531             :     /* First pass: determine size of output string */
    6532           0 :     i = 0; /* chars up to and including most recent \n or \r */
    6533           0 :     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    6534           0 :     e = self->str + self->length; /* end of input */
    6535           0 :     for (p = self->str; p < e; p++)
    6536           0 :         if (*p == '\t') {
    6537           0 :             if (tabsize > 0) {
    6538           0 :                 incr = tabsize - (j % tabsize); /* cannot overflow */
    6539           0 :                 if (j > PY_SSIZE_T_MAX - incr)
    6540           0 :                     goto overflow1;
    6541           0 :                 j += incr;
    6542             :             }
    6543             :         }
    6544             :         else {
    6545           0 :             if (j > PY_SSIZE_T_MAX - 1)
    6546           0 :                 goto overflow1;
    6547           0 :             j++;
    6548           0 :             if (*p == '\n' || *p == '\r') {
    6549           0 :                 if (i > PY_SSIZE_T_MAX - j)
    6550           0 :                     goto overflow1;
    6551           0 :                 i += j;
    6552           0 :                 j = 0;
    6553             :             }
    6554             :         }
    6555             : 
    6556           0 :     if (i > PY_SSIZE_T_MAX - j)
    6557           0 :         goto overflow1;
    6558             : 
    6559             :     /* Second pass: create output string and fill it */
    6560           0 :     u = _PyUnicode_New(i + j);
    6561           0 :     if (!u)
    6562           0 :         return NULL;
    6563             : 
    6564           0 :     j = 0; /* same as in first pass */
    6565           0 :     q = u->str; /* next output char */
    6566           0 :     qe = u->str + u->length; /* end of output */
    6567             : 
    6568           0 :     for (p = self->str; p < e; p++)
    6569           0 :         if (*p == '\t') {
    6570           0 :             if (tabsize > 0) {
    6571           0 :                 i = tabsize - (j % tabsize);
    6572           0 :                 j += i;
    6573           0 :                 while (i--) {
    6574           0 :                     if (q >= qe)
    6575           0 :                         goto overflow2;
    6576           0 :                     *q++ = ' ';
    6577             :                 }
    6578             :             }
    6579             :         }
    6580             :         else {
    6581           0 :             if (q >= qe)
    6582           0 :                 goto overflow2;
    6583           0 :             *q++ = *p;
    6584           0 :             j++;
    6585           0 :             if (*p == '\n' || *p == '\r')
    6586           0 :                 j = 0;
    6587             :         }
    6588             : 
    6589           0 :     return (PyObject*) u;
    6590             : 
    6591             :   overflow2:
    6592           0 :     Py_DECREF(u);
    6593             :   overflow1:
    6594           0 :     PyErr_SetString(PyExc_OverflowError, "new string is too long");
    6595           0 :     return NULL;
    6596             : }
    6597             : 
    6598             : PyDoc_STRVAR(find__doc__,
    6599             :              "S.find(sub [,start [,end]]) -> int\n\
    6600             : \n\
    6601             : Return the lowest index in S where substring sub is found,\n\
    6602             : such that sub is contained within S[start:end].  Optional\n\
    6603             : arguments start and end are interpreted as in slice notation.\n\
    6604             : \n\
    6605             : Return -1 on failure.");
    6606             : 
    6607             : static PyObject *
    6608           0 : unicode_find(PyUnicodeObject *self, PyObject *args)
    6609             : {
    6610             :     PyUnicodeObject *substring;
    6611             :     Py_ssize_t start;
    6612             :     Py_ssize_t end;
    6613             :     Py_ssize_t result;
    6614             : 
    6615           0 :     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
    6616             :                                             &start, &end))
    6617           0 :         return NULL;
    6618             : 
    6619           0 :     result = stringlib_find_slice(
    6620           0 :         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
    6621           0 :         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
    6622             :         start, end
    6623             :         );
    6624             : 
    6625           0 :     Py_DECREF(substring);
    6626             : 
    6627           0 :     return PyInt_FromSsize_t(result);
    6628             : }
    6629             : 
    6630             : static PyObject *
    6631           0 : unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
    6632             : {
    6633           0 :     if (index < 0 || index >= self->length) {
    6634           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    6635           0 :         return NULL;
    6636             :     }
    6637             : 
    6638           0 :     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
    6639             : }
    6640             : 
    6641             : static long
    6642           0 : unicode_hash(PyUnicodeObject *self)
    6643             : {
    6644             :     /* Since Unicode objects compare equal to their ASCII string
    6645             :        counterparts, they should use the individual character values
    6646             :        as basis for their hash value.  This is needed to assure that
    6647             :        strings and Unicode objects behave in the same way as
    6648             :        dictionary keys. */
    6649             : 
    6650             :     register Py_ssize_t len;
    6651             :     register Py_UNICODE *p;
    6652             :     register long x;
    6653             : 
    6654             : #ifdef Py_DEBUG
    6655             :     assert(_Py_HashSecret_Initialized);
    6656             : #endif
    6657           0 :     if (self->hash != -1)
    6658           0 :         return self->hash;
    6659           0 :     len = PyUnicode_GET_SIZE(self);
    6660             :     /*
    6661             :       We make the hash of the empty string be 0, rather than using
    6662             :       (prefix ^ suffix), since this slightly obfuscates the hash secret
    6663             :     */
    6664           0 :     if (len == 0) {
    6665           0 :         self->hash = 0;
    6666           0 :         return 0;
    6667             :     }
    6668           0 :     p = PyUnicode_AS_UNICODE(self);
    6669           0 :     x = _Py_HashSecret.prefix;
    6670           0 :     x ^= *p << 7;
    6671           0 :     while (--len >= 0)
    6672           0 :         x = (1000003*x) ^ *p++;
    6673           0 :     x ^= PyUnicode_GET_SIZE(self);
    6674           0 :     x ^= _Py_HashSecret.suffix;
    6675           0 :     if (x == -1)
    6676           0 :         x = -2;
    6677           0 :     self->hash = x;
    6678           0 :     return x;
    6679             : }
    6680             : 
    6681             : PyDoc_STRVAR(index__doc__,
    6682             :              "S.index(sub [,start [,end]]) -> int\n\
    6683             : \n\
    6684             : Like S.find() but raise ValueError when the substring is not found.");
    6685             : 
    6686             : static PyObject *
    6687           0 : unicode_index(PyUnicodeObject *self, PyObject *args)
    6688             : {
    6689             :     Py_ssize_t result;
    6690             :     PyUnicodeObject *substring;
    6691             :     Py_ssize_t start;
    6692             :     Py_ssize_t end;
    6693             : 
    6694           0 :     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
    6695             :                                             &start, &end))
    6696           0 :         return NULL;
    6697             : 
    6698           0 :     result = stringlib_find_slice(
    6699           0 :         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
    6700           0 :         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
    6701             :         start, end
    6702             :         );
    6703             : 
    6704           0 :     Py_DECREF(substring);
    6705             : 
    6706           0 :     if (result < 0) {
    6707           0 :         PyErr_SetString(PyExc_ValueError, "substring not found");
    6708           0 :         return NULL;
    6709             :     }
    6710             : 
    6711           0 :     return PyInt_FromSsize_t(result);
    6712             : }
    6713             : 
    6714             : PyDoc_STRVAR(islower__doc__,
    6715             :              "S.islower() -> bool\n\
    6716             : \n\
    6717             : Return True if all cased characters in S are lowercase and there is\n\
    6718             : at least one cased character in S, False otherwise.");
    6719             : 
    6720             : static PyObject*
    6721           0 : unicode_islower(PyUnicodeObject *self)
    6722             : {
    6723           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6724             :     register const Py_UNICODE *e;
    6725             :     int cased;
    6726             : 
    6727             :     /* Shortcut for single character strings */
    6728           0 :     if (PyUnicode_GET_SIZE(self) == 1)
    6729           0 :         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
    6730             : 
    6731             :     /* Special case for empty strings */
    6732           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6733           0 :         return PyBool_FromLong(0);
    6734             : 
    6735           0 :     e = p + PyUnicode_GET_SIZE(self);
    6736           0 :     cased = 0;
    6737           0 :     for (; p < e; p++) {
    6738           0 :         register const Py_UNICODE ch = *p;
    6739             : 
    6740           0 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
    6741           0 :             return PyBool_FromLong(0);
    6742           0 :         else if (!cased && Py_UNICODE_ISLOWER(ch))
    6743           0 :             cased = 1;
    6744             :     }
    6745           0 :     return PyBool_FromLong(cased);
    6746             : }
    6747             : 
    6748             : PyDoc_STRVAR(isupper__doc__,
    6749             :              "S.isupper() -> bool\n\
    6750             : \n\
    6751             : Return True if all cased characters in S are uppercase and there is\n\
    6752             : at least one cased character in S, False otherwise.");
    6753             : 
    6754             : static PyObject*
    6755           0 : unicode_isupper(PyUnicodeObject *self)
    6756             : {
    6757           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6758             :     register const Py_UNICODE *e;
    6759             :     int cased;
    6760             : 
    6761             :     /* Shortcut for single character strings */
    6762           0 :     if (PyUnicode_GET_SIZE(self) == 1)
    6763           0 :         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
    6764             : 
    6765             :     /* Special case for empty strings */
    6766           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6767           0 :         return PyBool_FromLong(0);
    6768             : 
    6769           0 :     e = p + PyUnicode_GET_SIZE(self);
    6770           0 :     cased = 0;
    6771           0 :     for (; p < e; p++) {
    6772           0 :         register const Py_UNICODE ch = *p;
    6773             : 
    6774           0 :         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
    6775           0 :             return PyBool_FromLong(0);
    6776           0 :         else if (!cased && Py_UNICODE_ISUPPER(ch))
    6777           0 :             cased = 1;
    6778             :     }
    6779           0 :     return PyBool_FromLong(cased);
    6780             : }
    6781             : 
    6782             : PyDoc_STRVAR(istitle__doc__,
    6783             :              "S.istitle() -> bool\n\
    6784             : \n\
    6785             : Return True if S is a titlecased string and there is at least one\n\
    6786             : character in S, i.e. upper- and titlecase characters may only\n\
    6787             : follow uncased characters and lowercase characters only cased ones.\n\
    6788             : Return False otherwise.");
    6789             : 
    6790             : static PyObject*
    6791           0 : unicode_istitle(PyUnicodeObject *self)
    6792             : {
    6793           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6794             :     register const Py_UNICODE *e;
    6795             :     int cased, previous_is_cased;
    6796             : 
    6797             :     /* Shortcut for single character strings */
    6798           0 :     if (PyUnicode_GET_SIZE(self) == 1)
    6799           0 :         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
    6800           0 :                                (Py_UNICODE_ISUPPER(*p) != 0));
    6801             : 
    6802             :     /* Special case for empty strings */
    6803           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6804           0 :         return PyBool_FromLong(0);
    6805             : 
    6806           0 :     e = p + PyUnicode_GET_SIZE(self);
    6807           0 :     cased = 0;
    6808           0 :     previous_is_cased = 0;
    6809           0 :     for (; p < e; p++) {
    6810           0 :         register const Py_UNICODE ch = *p;
    6811             : 
    6812           0 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
    6813           0 :             if (previous_is_cased)
    6814           0 :                 return PyBool_FromLong(0);
    6815           0 :             previous_is_cased = 1;
    6816           0 :             cased = 1;
    6817             :         }
    6818           0 :         else if (Py_UNICODE_ISLOWER(ch)) {
    6819           0 :             if (!previous_is_cased)
    6820           0 :                 return PyBool_FromLong(0);
    6821           0 :             previous_is_cased = 1;
    6822           0 :             cased = 1;
    6823             :         }
    6824             :         else
    6825           0 :             previous_is_cased = 0;
    6826             :     }
    6827           0 :     return PyBool_FromLong(cased);
    6828             : }
    6829             : 
    6830             : PyDoc_STRVAR(isspace__doc__,
    6831             :              "S.isspace() -> bool\n\
    6832             : \n\
    6833             : Return True if all characters in S are whitespace\n\
    6834             : and there is at least one character in S, False otherwise.");
    6835             : 
    6836             : static PyObject*
    6837           0 : unicode_isspace(PyUnicodeObject *self)
    6838             : {
    6839           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6840             :     register const Py_UNICODE *e;
    6841             : 
    6842             :     /* Shortcut for single character strings */
    6843           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6844           0 :         Py_UNICODE_ISSPACE(*p))
    6845           0 :         return PyBool_FromLong(1);
    6846             : 
    6847             :     /* Special case for empty strings */
    6848           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6849           0 :         return PyBool_FromLong(0);
    6850             : 
    6851           0 :     e = p + PyUnicode_GET_SIZE(self);
    6852           0 :     for (; p < e; p++) {
    6853           0 :         if (!Py_UNICODE_ISSPACE(*p))
    6854           0 :             return PyBool_FromLong(0);
    6855             :     }
    6856           0 :     return PyBool_FromLong(1);
    6857             : }
    6858             : 
    6859             : PyDoc_STRVAR(isalpha__doc__,
    6860             :              "S.isalpha() -> bool\n\
    6861             : \n\
    6862             : Return True if all characters in S are alphabetic\n\
    6863             : and there is at least one character in S, False otherwise.");
    6864             : 
    6865             : static PyObject*
    6866           0 : unicode_isalpha(PyUnicodeObject *self)
    6867             : {
    6868           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6869             :     register const Py_UNICODE *e;
    6870             : 
    6871             :     /* Shortcut for single character strings */
    6872           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6873           0 :         Py_UNICODE_ISALPHA(*p))
    6874           0 :         return PyBool_FromLong(1);
    6875             : 
    6876             :     /* Special case for empty strings */
    6877           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6878           0 :         return PyBool_FromLong(0);
    6879             : 
    6880           0 :     e = p + PyUnicode_GET_SIZE(self);
    6881           0 :     for (; p < e; p++) {
    6882           0 :         if (!Py_UNICODE_ISALPHA(*p))
    6883           0 :             return PyBool_FromLong(0);
    6884             :     }
    6885           0 :     return PyBool_FromLong(1);
    6886             : }
    6887             : 
    6888             : PyDoc_STRVAR(isalnum__doc__,
    6889             :              "S.isalnum() -> bool\n\
    6890             : \n\
    6891             : Return True if all characters in S are alphanumeric\n\
    6892             : and there is at least one character in S, False otherwise.");
    6893             : 
    6894             : static PyObject*
    6895           0 : unicode_isalnum(PyUnicodeObject *self)
    6896             : {
    6897           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6898             :     register const Py_UNICODE *e;
    6899             : 
    6900             :     /* Shortcut for single character strings */
    6901           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6902           0 :         Py_UNICODE_ISALNUM(*p))
    6903           0 :         return PyBool_FromLong(1);
    6904             : 
    6905             :     /* Special case for empty strings */
    6906           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6907           0 :         return PyBool_FromLong(0);
    6908             : 
    6909           0 :     e = p + PyUnicode_GET_SIZE(self);
    6910           0 :     for (; p < e; p++) {
    6911           0 :         if (!Py_UNICODE_ISALNUM(*p))
    6912           0 :             return PyBool_FromLong(0);
    6913             :     }
    6914           0 :     return PyBool_FromLong(1);
    6915             : }
    6916             : 
    6917             : PyDoc_STRVAR(isdecimal__doc__,
    6918             :              "S.isdecimal() -> bool\n\
    6919             : \n\
    6920             : Return True if there are only decimal characters in S,\n\
    6921             : False otherwise.");
    6922             : 
    6923             : static PyObject*
    6924           0 : unicode_isdecimal(PyUnicodeObject *self)
    6925             : {
    6926           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6927             :     register const Py_UNICODE *e;
    6928             : 
    6929             :     /* Shortcut for single character strings */
    6930           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6931           0 :         Py_UNICODE_ISDECIMAL(*p))
    6932           0 :         return PyBool_FromLong(1);
    6933             : 
    6934             :     /* Special case for empty strings */
    6935           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6936           0 :         return PyBool_FromLong(0);
    6937             : 
    6938           0 :     e = p + PyUnicode_GET_SIZE(self);
    6939           0 :     for (; p < e; p++) {
    6940           0 :         if (!Py_UNICODE_ISDECIMAL(*p))
    6941           0 :             return PyBool_FromLong(0);
    6942             :     }
    6943           0 :     return PyBool_FromLong(1);
    6944             : }
    6945             : 
    6946             : PyDoc_STRVAR(isdigit__doc__,
    6947             :              "S.isdigit() -> bool\n\
    6948             : \n\
    6949             : Return True if all characters in S are digits\n\
    6950             : and there is at least one character in S, False otherwise.");
    6951             : 
    6952             : static PyObject*
    6953           0 : unicode_isdigit(PyUnicodeObject *self)
    6954             : {
    6955           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6956             :     register const Py_UNICODE *e;
    6957             : 
    6958             :     /* Shortcut for single character strings */
    6959           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6960           0 :         Py_UNICODE_ISDIGIT(*p))
    6961           0 :         return PyBool_FromLong(1);
    6962             : 
    6963             :     /* Special case for empty strings */
    6964           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6965           0 :         return PyBool_FromLong(0);
    6966             : 
    6967           0 :     e = p + PyUnicode_GET_SIZE(self);
    6968           0 :     for (; p < e; p++) {
    6969           0 :         if (!Py_UNICODE_ISDIGIT(*p))
    6970           0 :             return PyBool_FromLong(0);
    6971             :     }
    6972           0 :     return PyBool_FromLong(1);
    6973             : }
    6974             : 
    6975             : PyDoc_STRVAR(isnumeric__doc__,
    6976             :              "S.isnumeric() -> bool\n\
    6977             : \n\
    6978             : Return True if there are only numeric characters in S,\n\
    6979             : False otherwise.");
    6980             : 
    6981             : static PyObject*
    6982           0 : unicode_isnumeric(PyUnicodeObject *self)
    6983             : {
    6984           0 :     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    6985             :     register const Py_UNICODE *e;
    6986             : 
    6987             :     /* Shortcut for single character strings */
    6988           0 :     if (PyUnicode_GET_SIZE(self) == 1 &&
    6989           0 :         Py_UNICODE_ISNUMERIC(*p))
    6990           0 :         return PyBool_FromLong(1);
    6991             : 
    6992             :     /* Special case for empty strings */
    6993           0 :     if (PyUnicode_GET_SIZE(self) == 0)
    6994           0 :         return PyBool_FromLong(0);
    6995             : 
    6996           0 :     e = p + PyUnicode_GET_SIZE(self);
    6997           0 :     for (; p < e; p++) {
    6998           0 :         if (!Py_UNICODE_ISNUMERIC(*p))
    6999           0 :             return PyBool_FromLong(0);
    7000             :     }
    7001           0 :     return PyBool_FromLong(1);
    7002             : }
    7003             : 
    7004             : PyDoc_STRVAR(join__doc__,
    7005             :              "S.join(iterable) -> unicode\n\
    7006             : \n\
    7007             : Return a string which is the concatenation of the strings in the\n\
    7008             : iterable.  The separator between elements is S.");
    7009             : 
    7010             : static PyObject*
    7011           0 : unicode_join(PyObject *self, PyObject *data)
    7012             : {
    7013           0 :     return PyUnicode_Join(self, data);
    7014             : }
    7015             : 
    7016             : static Py_ssize_t
    7017           0 : unicode_length(PyUnicodeObject *self)
    7018             : {
    7019           0 :     return self->length;
    7020             : }
    7021             : 
    7022             : PyDoc_STRVAR(ljust__doc__,
    7023             :              "S.ljust(width[, fillchar]) -> int\n\
    7024             : \n\
    7025             : Return S left-justified in a Unicode string of length width. Padding is\n\
    7026             : done using the specified fill character (default is a space).");
    7027             : 
    7028             : static PyObject *
    7029           0 : unicode_ljust(PyUnicodeObject *self, PyObject *args)
    7030             : {
    7031             :     Py_ssize_t width;
    7032           0 :     Py_UNICODE fillchar = ' ';
    7033             : 
    7034           0 :     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
    7035           0 :         return NULL;
    7036             : 
    7037           0 :     if (self->length >= width && PyUnicode_CheckExact(self)) {
    7038           0 :         Py_INCREF(self);
    7039           0 :         return (PyObject*) self;
    7040             :     }
    7041             : 
    7042           0 :     return (PyObject*) pad(self, 0, width - self->length, fillchar);
    7043             : }
    7044             : 
    7045             : PyDoc_STRVAR(lower__doc__,
    7046             :              "S.lower() -> unicode\n\
    7047             : \n\
    7048             : Return a copy of the string S converted to lowercase.");
    7049             : 
    7050             : static PyObject*
    7051           0 : unicode_lower(PyUnicodeObject *self)
    7052             : {
    7053           0 :     return fixup(self, fixlower);
    7054             : }
    7055             : 
    7056             : #define LEFTSTRIP 0
    7057             : #define RIGHTSTRIP 1
    7058             : #define BOTHSTRIP 2
    7059             : 
    7060             : /* Arrays indexed by above */
    7061             : static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
    7062             : 
    7063             : #define STRIPNAME(i) (stripformat[i]+3)
    7064             : 
    7065             : /* externally visible for str.strip(unicode) */
    7066             : PyObject *
    7067           0 : _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
    7068             : {
    7069           0 :     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
    7070           0 :     Py_ssize_t len = PyUnicode_GET_SIZE(self);
    7071           0 :     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
    7072           0 :     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
    7073             :     Py_ssize_t i, j;
    7074             : 
    7075           0 :     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
    7076             : 
    7077           0 :     i = 0;
    7078           0 :     if (striptype != RIGHTSTRIP) {
    7079           0 :         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
    7080           0 :             i++;
    7081             :         }
    7082             :     }
    7083             : 
    7084           0 :     j = len;
    7085           0 :     if (striptype != LEFTSTRIP) {
    7086             :         do {
    7087           0 :             j--;
    7088           0 :         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
    7089           0 :         j++;
    7090             :     }
    7091             : 
    7092           0 :     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
    7093           0 :         Py_INCREF(self);
    7094           0 :         return (PyObject*)self;
    7095             :     }
    7096             :     else
    7097           0 :         return PyUnicode_FromUnicode(s+i, j-i);
    7098             : }
    7099             : 
    7100             : 
    7101             : static PyObject *
    7102           0 : do_strip(PyUnicodeObject *self, int striptype)
    7103             : {
    7104           0 :     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
    7105           0 :     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
    7106             : 
    7107           0 :     i = 0;
    7108           0 :     if (striptype != RIGHTSTRIP) {
    7109           0 :         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
    7110           0 :             i++;
    7111             :         }
    7112             :     }
    7113             : 
    7114           0 :     j = len;
    7115           0 :     if (striptype != LEFTSTRIP) {
    7116             :         do {
    7117           0 :             j--;
    7118           0 :         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
    7119           0 :         j++;
    7120             :     }
    7121             : 
    7122           0 :     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
    7123           0 :         Py_INCREF(self);
    7124           0 :         return (PyObject*)self;
    7125             :     }
    7126             :     else
    7127           0 :         return PyUnicode_FromUnicode(s+i, j-i);
    7128             : }
    7129             : 
    7130             : 
    7131             : static PyObject *
    7132           0 : do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
    7133             : {
    7134           0 :     PyObject *sep = NULL;
    7135             : 
    7136           0 :     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
    7137           0 :         return NULL;
    7138             : 
    7139           0 :     if (sep != NULL && sep != Py_None) {
    7140           0 :         if (PyUnicode_Check(sep))
    7141           0 :             return _PyUnicode_XStrip(self, striptype, sep);
    7142           0 :         else if (PyString_Check(sep)) {
    7143             :             PyObject *res;
    7144           0 :             sep = PyUnicode_FromObject(sep);
    7145           0 :             if (sep==NULL)
    7146           0 :                 return NULL;
    7147           0 :             res = _PyUnicode_XStrip(self, striptype, sep);
    7148           0 :             Py_DECREF(sep);
    7149           0 :             return res;
    7150             :         }
    7151             :         else {
    7152           0 :             PyErr_Format(PyExc_TypeError,
    7153             :                          "%s arg must be None, unicode or str",
    7154           0 :                          STRIPNAME(striptype));
    7155           0 :             return NULL;
    7156             :         }
    7157             :     }
    7158             : 
    7159           0 :     return do_strip(self, striptype);
    7160             : }
    7161             : 
    7162             : 
    7163             : PyDoc_STRVAR(strip__doc__,
    7164             :              "S.strip([chars]) -> unicode\n\
    7165             : \n\
    7166             : Return a copy of the string S with leading and trailing\n\
    7167             : whitespace removed.\n\
    7168             : If chars is given and not None, remove characters in chars instead.\n\
    7169             : If chars is a str, it will be converted to unicode before stripping");
    7170             : 
    7171             : static PyObject *
    7172           0 : unicode_strip(PyUnicodeObject *self, PyObject *args)
    7173             : {
    7174           0 :     if (PyTuple_GET_SIZE(args) == 0)
    7175           0 :         return do_strip(self, BOTHSTRIP); /* Common case */
    7176             :     else
    7177           0 :         return do_argstrip(self, BOTHSTRIP, args);
    7178             : }
    7179             : 
    7180             : 
    7181             : PyDoc_STRVAR(lstrip__doc__,
    7182             :              "S.lstrip([chars]) -> unicode\n\
    7183             : \n\
    7184             : Return a copy of the string S with leading whitespace removed.\n\
    7185             : If chars is given and not None, remove characters in chars instead.\n\
    7186             : If chars is a str, it will be converted to unicode before stripping");
    7187             : 
    7188             : static PyObject *
    7189           0 : unicode_lstrip(PyUnicodeObject *self, PyObject *args)
    7190             : {
                           /* Method implementation behind S.lstrip().  Same dispatch as
                            * the other strip variants, but with LEFTSTRIP as the strip
                            * type: no arguments -> do_strip(), otherwise do_argstrip(). */
    7191           0 :     if (PyTuple_GET_SIZE(args) == 0)
    7192           0 :         return do_strip(self, LEFTSTRIP); /* Common case */
    7193             :     else
    7194           0 :         return do_argstrip(self, LEFTSTRIP, args);
    7195             : }
    7196             : 
    7197             : 
    7198             : PyDoc_STRVAR(rstrip__doc__,
    7199             :              "S.rstrip([chars]) -> unicode\n\
    7200             : \n\
    7201             : Return a copy of the string S with trailing whitespace removed.\n\
    7202             : If chars is given and not None, remove characters in chars instead.\n\
    7203             : If chars is a str, it will be converted to unicode before stripping");
    7204             : 
    7205             : static PyObject *
    7206           0 : unicode_rstrip(PyUnicodeObject *self, PyObject *args)
    7207             : {
                           /* Method implementation behind S.rstrip().  Same dispatch as
                            * the other strip variants, but with RIGHTSTRIP as the strip
                            * type: no arguments -> do_strip(), otherwise do_argstrip(). */
    7208           0 :     if (PyTuple_GET_SIZE(args) == 0)
    7209           0 :         return do_strip(self, RIGHTSTRIP); /* Common case */
    7210             :     else
    7211           0 :         return do_argstrip(self, RIGHTSTRIP, args);
    7212             : }
    7213             : 
    7214             : 
    7215             : static PyObject*
    7216           0 : unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
    7217             : {
                           /* Return a string made of `len` consecutive copies of `str`.
                            * A negative `len` is treated as 0.  Returns NULL with
                            * OverflowError set when the result would be too long, and
                            * NULL when _PyUnicode_New() fails. */
    7218             :     PyUnicodeObject *u;
    7219             :     Py_UNICODE *p;
    7220             :     Py_ssize_t nchars;
    7221             :     size_t nbytes;
    7222             : 
    7223           0 :     if (len < 0)
    7224           0 :         len = 0;
    7225             : 
    7226           0 :     if (len == 1 && PyUnicode_CheckExact(str)) {
    7227             :         /* no repeat, return original string */
    7228           0 :         Py_INCREF(str);
    7229           0 :         return (PyObject*) str;
    7230             :     }
    7231             : 
    7232             :     /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
    7233             :      * needed doesn't overflow size_t
    7234             :      */
    7235           0 :     if (len && str->length > PY_SSIZE_T_MAX / len) {
    7236           0 :         PyErr_SetString(PyExc_OverflowError,
    7237             :                         "repeated string is too long");
    7238           0 :         return NULL;
    7239             :     }
    7240           0 :     nchars = len * str->length;
                           /* nbytes is only used for the overflow test below; the
                            * allocation itself is requested in characters.  The "+ 1u"
                            * presumably reserves room for a terminator -- confirm
                            * against _PyUnicode_New(). */
    7241           0 :     nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
    7242           0 :     if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
    7243           0 :         PyErr_SetString(PyExc_OverflowError,
    7244             :                         "repeated string is too long");
    7245           0 :         return NULL;
    7246             :     }
    7247           0 :     u = _PyUnicode_New(nchars);
    7248           0 :     if (!u)
    7249           0 :         return NULL;
    7250             : 
    7251           0 :     p = u->str;
    7252             : 
                           /* A one-character source is written with a plain fill. */
    7253           0 :     if (str->length == 1 && len > 0) {
    7254           0 :         Py_UNICODE_FILL(p, str->str[0], len);
    7255             :     } else {
                               /* Otherwise copy the source once, then repeatedly append
                                * the part of the buffer already written.  Each pass
                                * copies min(done, remaining) characters, so the filled
                                * region doubles until nchars is reached.  When nchars
                                * is 0 nothing is copied at all. */
    7256           0 :         Py_ssize_t done = 0; /* number of characters copied this far */
    7257           0 :         if (done < nchars) {
    7258           0 :             Py_UNICODE_COPY(p, str->str, str->length);
    7259           0 :             done = str->length;
    7260             :         }
    7261           0 :         while (done < nchars) {
    7262           0 :             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
    7263           0 :             Py_UNICODE_COPY(p+done, p, n);
    7264           0 :             done += n;
    7265             :         }
    7266             :     }
    7267             : 
    7268           0 :     return (PyObject*) u;
    7269             : }
    7270             : 
    7271           0 : PyObject *PyUnicode_Replace(PyObject *obj,
    7272             :                             PyObject *subobj,
    7273             :                             PyObject *replobj,
    7274             :                             Py_ssize_t maxcount)
    7275             : {
                           /* Object-level replace: run obj, subobj and replobj through
                            * PyUnicode_FromObject(), then hand the three results and
                            * maxcount to replace().  Returns whatever replace() returns,
                            * or NULL as soon as one of the conversions fails.  Every
                            * converted object obtained here is released before
                            * returning, on the error paths as well. */
    7276             :     PyObject *self;
    7277             :     PyObject *str1;
    7278             :     PyObject *str2;
    7279             :     PyObject *result;
    7280             : 
    7281           0 :     self = PyUnicode_FromObject(obj);
    7282           0 :     if (self == NULL)
    7283           0 :         return NULL;
    7284           0 :     str1 = PyUnicode_FromObject(subobj);
    7285           0 :     if (str1 == NULL) {
    7286           0 :         Py_DECREF(self);
    7287           0 :         return NULL;
    7288             :     }
    7289           0 :     str2 = PyUnicode_FromObject(replobj);
    7290           0 :     if (str2 == NULL) {
    7291           0 :         Py_DECREF(self);
    7292           0 :         Py_DECREF(str1);
    7293           0 :         return NULL;
    7294             :     }
    7295           0 :     result = replace((PyUnicodeObject *)self,
    7296             :                      (PyUnicodeObject *)str1,
    7297             :                      (PyUnicodeObject *)str2,
    7298             :                      maxcount);
    7299           0 :     Py_DECREF(self);
    7300           0 :     Py_DECREF(str1);
    7301           0 :     Py_DECREF(str2);
    7302           0 :     return result;
    7303             : }
    7304             : 
    7305             : PyDoc_STRVAR(replace__doc__,
    7306             :              "S.replace(old, new[, count]) -> unicode\n\
    7307             : \n\
    7308             : Return a copy of S with all occurrences of substring\n\
    7309             : old replaced by new.  If the optional argument count is\n\
    7310             : given, only the first count occurrences are replaced.");
    7311             : 
    7312             : static PyObject*
    7313           0 : unicode_replace(PyUnicodeObject *self, PyObject *args)
    7314             : {
                           /* Method implementation behind S.replace(old, new[, count]).
                            * Parses two objects plus an optional count (left at -1 when
                            * omitted), converts both objects with PyUnicode_FromObject()
                            * and calls replace().  Returns NULL if parsing or either
                            * conversion fails. */
    7315             :     PyUnicodeObject *str1;
    7316             :     PyUnicodeObject *str2;
    7317           0 :     Py_ssize_t maxcount = -1;
    7318             :     PyObject *result;
    7319             : 
    7320           0 :     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
    7321           0 :         return NULL;
                           /* str1/str2 first hold the parsed argument objects and are
                            * then overwritten with the converted objects; only the
                            * converted objects are released below. */
    7322           0 :     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
    7323           0 :     if (str1 == NULL)
    7324           0 :         return NULL;
    7325           0 :     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
    7326           0 :     if (str2 == NULL) {
    7327           0 :         Py_DECREF(str1);
    7328           0 :         return NULL;
    7329             :     }
    7330             : 
    7331           0 :     result = replace(self, str1, str2, maxcount);
    7332             : 
    7333           0 :     Py_DECREF(str1);
    7334           0 :     Py_DECREF(str2);
    7335           0 :     return result;
    7336             : }
    7337             : 
    7338             : static
    7339           0 : PyObject *unicode_repr(PyObject *unicode)
    7340             : {
                           /* Produce the repr by passing the object's character buffer
                            * and size to unicodeescape_string().  The final argument 1
                            * presumably asks for surrounding quotes -- confirm against
                            * unicodeescape_string(). */
    7341           0 :     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
    7342             :                                 PyUnicode_GET_SIZE(unicode),
    7343             :                                 1);
    7344             : }
    7345             : 
    7346             : PyDoc_STRVAR(rfind__doc__,
    7347             :              "S.rfind(sub [,start [,end]]) -> int\n\
    7348             : \n\
    7349             : Return the highest index in S where substring sub is found,\n\
    7350             : such that sub is contained within S[start:end].  Optional\n\
    7351             : arguments start and end are interpreted as in slice notation.\n\
    7352             : \n\
    7353             : Return -1 on failure.");
    7354             : 
    7355             : static PyObject *
    7356           0 : unicode_rfind(PyUnicodeObject *self, PyObject *args)
    7357             : {
                           /* Method implementation behind S.rfind().  Argument parsing
                            * is delegated to stringlib_parse_args_finds_unicode(), which
                            * fills in substring, start and end; the search itself is
                            * done by stringlib_rfind_slice() and its result is returned
                            * as a Python int without further checks. */
    7358             :     PyUnicodeObject *substring;
    7359             :     Py_ssize_t start;
    7360             :     Py_ssize_t end;
    7361             :     Py_ssize_t result;
    7362             : 
    7363           0 :     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
    7364             :                                             &start, &end))
    7365           0 :         return NULL;
    7366             : 
    7367           0 :     result = stringlib_rfind_slice(
    7368           0 :         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
    7369           0 :         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
    7370             :         start, end
    7371             :         );
    7372             : 
                           /* The parsed substring is released here once the search is
                            * finished. */
    7373           0 :     Py_DECREF(substring);
    7374             : 
    7375           0 :     return PyInt_FromSsize_t(result);
    7376             : }
    7377             : 
    7378             : PyDoc_STRVAR(rindex__doc__,
    7379             :              "S.rindex(sub [,start [,end]]) -> int\n\
    7380             : \n\
    7381             : Like S.rfind() but raise ValueError when the substring is not found.");
    7382             : 
    7383             : static PyObject *
    7384           0 : unicode_rindex(PyUnicodeObject *self, PyObject *args)
    7385             : {
                           /* Method implementation behind S.rindex().  Performs the same
                            * parse and stringlib_rfind_slice() search as rfind, but a
                            * negative search result is turned into a ValueError instead
                            * of being returned. */
    7386             :     PyUnicodeObject *substring;
    7387             :     Py_ssize_t start;
    7388             :     Py_ssize_t end;
    7389             :     Py_ssize_t result;
    7390             : 
    7391           0 :     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
    7392             :                                             &start, &end))
    7393           0 :         return NULL;
    7394             : 
    7395           0 :     result = stringlib_rfind_slice(
    7396           0 :         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
    7397           0 :         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
    7398             :         start, end
    7399             :         );
    7400             : 
                           /* The parsed substring is released before the result is
                            * inspected, so both exits below are leak-free. */
    7401           0 :     Py_DECREF(substring);
    7402             : 
    7403           0 :     if (result < 0) {
    7404           0 :         PyErr_SetString(PyExc_ValueError, "substring not found");
    7405           0 :         return NULL;
    7406             :     }
    7407           0 :     return PyInt_FromSsize_t(result);
    7408             : }
    7409             : 
    7410             : PyDoc_STRVAR(rjust__doc__,
    7411             :              "S.rjust(width[, fillchar]) -> unicode\n\
    7412             : \n\
    7413             : Return S right-justified in a Unicode string of length width. Padding is\n\
    7414             : done using the specified fill character (default is a space).");
    7415             : 
    7416             : static PyObject *
    7417           0 : unicode_rjust(PyUnicodeObject *self, PyObject *args)
    7418             : {
                           /* Method implementation behind S.rjust(width[, fillchar]).
                            * fillchar stays ' ' unless the optional argument is given,
                            * in which case the convert_uc converter stores it. */
    7419             :     Py_ssize_t width;
    7420           0 :     Py_UNICODE fillchar = ' ';
    7421             : 
    7422           0 :     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
    7423           0 :         return NULL;
    7424             : 
                           /* Already wide enough and exactly of the base unicode type:
                            * return self with an extra reference instead of copying.
                            * Other objects that are already wide enough fall through to
                            * pad() with a non-positive first amount. */
    7425           0 :     if (self->length >= width && PyUnicode_CheckExact(self)) {
    7426           0 :         Py_INCREF(self);
    7427           0 :         return (PyObject*) self;
    7428             :     }
    7429             : 
                           /* NOTE(review): the two numeric arguments are presumably the
                            * left and right padding amounts -- confirm against pad(). */
    7430           0 :     return (PyObject*) pad(self, width - self->length, 0, fillchar);
    7431             : }
    7432             : 
    7433             : static PyObject*
    7434           0 : unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
    7435             : {
                           /* Return the characters of self in [start, end).  Negative
                            * bounds are clamped to 0 (they are not treated as offsets
                            * from the end here), end is clamped to the string length,
                            * and start > end yields an empty result. */
    7436             :     /* standard clamping */
    7437           0 :     if (start < 0)
    7438           0 :         start = 0;
    7439           0 :     if (end < 0)
    7440           0 :         end = 0;
    7441           0 :     if (end > self->length)
    7442           0 :         end = self->length;
    7443           0 :     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
    7444             :         /* full slice, return original string */
    7445           0 :         Py_INCREF(self);
    7446           0 :         return (PyObject*) self;
    7447             :     }
                           /* Collapse an inverted range so the length passed below is
                            * never negative. */
    7448           0 :     if (start > end)
    7449           0 :         start = end;
    7450             :     /* copy slice */
    7451           0 :     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
    7452             :                                              end - start);
    7453             : }
    7454             : 
    7455           0 : PyObject *PyUnicode_Split(PyObject *s,
    7456             :                           PyObject *sep,
    7457             :                           Py_ssize_t maxsplit)
    7458             : {
                           /* Object-level split: convert s (and sep, when it is not
                            * NULL) with PyUnicode_FromObject(), then call split().  A
                            * NULL sep is passed through to split() unchanged.  Returns
                            * split()'s result, or NULL if a conversion fails. */
    7459             :     PyObject *result;
    7460             : 
    7461           0 :     s = PyUnicode_FromObject(s);
    7462           0 :     if (s == NULL)
    7463           0 :         return NULL;
    7464           0 :     if (sep != NULL) {
    7465           0 :         sep = PyUnicode_FromObject(sep);
    7466           0 :         if (sep == NULL) {
    7467           0 :             Py_DECREF(s);
    7468           0 :             return NULL;
    7469             :         }
    7470             :     }
    7471             : 
    7472           0 :     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
    7473             : 
    7474           0 :     Py_DECREF(s);
                           /* sep may still be NULL here, hence the X variant. */
    7475           0 :     Py_XDECREF(sep);
    7476           0 :     return result;
    7477             : }
    7478             : 
    7479             : PyDoc_STRVAR(split__doc__,
    7480             :              "S.split([sep [,maxsplit]]) -> list of strings\n\
    7481             : \n\
    7482             : Return a list of the words in S, using sep as the\n\
    7483             : delimiter string.  If maxsplit is given, at most maxsplit\n\
    7484             : splits are done. If sep is not specified or is None, any\n\
    7485             : whitespace string is a separator and empty strings are\n\
    7486             : removed from the result.");
    7487             : 
    7488             : static PyObject*
    7489           0 : unicode_split(PyUnicodeObject *self, PyObject *args)
    7490             : {
                           /* Method implementation behind S.split([sep[, maxsplit]]).
                            * Both arguments are optional: the separator defaults to
                            * None and the split limit to -1. */
    7491           0 :     PyObject *substring = Py_None;
    7492           0 :     Py_ssize_t maxcount = -1;
    7493             : 
    7494           0 :     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
    7495           0 :         return NULL;
    7496             : 
                           /* None -> split() with a NULL separator; a unicode separator
                            * is passed to split() directly; anything else goes through
                            * PyUnicode_Split(), which converts the separator first. */
    7497           0 :     if (substring == Py_None)
    7498           0 :         return split(self, NULL, maxcount);
    7499           0 :     else if (PyUnicode_Check(substring))
    7500           0 :         return split(self, (PyUnicodeObject *)substring, maxcount);
    7501             :     else
    7502           0 :         return PyUnicode_Split((PyObject *)self, substring, maxcount);
    7503             : }
    7504             : 
    7505             : PyObject *
    7506           0 : PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
    7507             : {
                           /* Object-level partition: convert both arguments with
                            * PyUnicode_FromObject() and pass each converted object,
                            * together with its buffer and size, to
                            * stringlib_partition().  Returns that call's result, or
                            * NULL if a conversion fails.  Both converted objects are
                            * released before returning. */
    7508             :     PyObject* str_obj;
    7509             :     PyObject* sep_obj;
    7510             :     PyObject* out;
    7511             : 
    7512           0 :     str_obj = PyUnicode_FromObject(str_in);
    7513           0 :     if (!str_obj)
    7514           0 :         return NULL;
    7515           0 :     sep_obj = PyUnicode_FromObject(sep_in);
    7516           0 :     if (!sep_obj) {
    7517           0 :         Py_DECREF(str_obj);
    7518           0 :         return NULL;
    7519             :     }
    7520             : 
    7521           0 :     out = stringlib_partition(
    7522           0 :         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
    7523           0 :         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
    7524             :         );
    7525             : 
    7526           0 :     Py_DECREF(sep_obj);
    7527           0 :     Py_DECREF(str_obj);
    7528             : 
    7529           0 :     return out;
    7530             : }
    7531             : 
    7532             : 
    7533             : PyObject *
    7534           0 : PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
    7535             : {
                           /* Object-level rpartition: convert both arguments with
                            * PyUnicode_FromObject() and pass each converted object,
                            * together with its buffer and size, to
                            * stringlib_rpartition().  Returns that call's result, or
                            * NULL if a conversion fails.  Both converted objects are
                            * released before returning. */
    7536             :     PyObject* str_obj;
    7537             :     PyObject* sep_obj;
    7538             :     PyObject* out;
    7539             : 
    7540           0 :     str_obj = PyUnicode_FromObject(str_in);
    7541           0 :     if (!str_obj)
    7542           0 :         return NULL;
    7543           0 :     sep_obj = PyUnicode_FromObject(sep_in);
    7544           0 :     if (!sep_obj) {
    7545           0 :         Py_DECREF(str_obj);
    7546           0 :         return NULL;
    7547             :     }
    7548             : 
    7549           0 :     out = stringlib_rpartition(
    7550           0 :         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
    7551           0 :         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
    7552             :         );
    7553             : 
    7554           0 :     Py_DECREF(sep_obj);
    7555           0 :     Py_DECREF(str_obj);
    7556             : 
    7557           0 :     return out;
    7558             : }
    7559             : 
    7560             : PyDoc_STRVAR(partition__doc__,
    7561             :              "S.partition(sep) -> (head, sep, tail)\n\
    7562             : \n\
    7563             : Search for the separator sep in S, and return the part before it,\n\
    7564             : the separator itself, and the part after it.  If the separator is not\n\
    7565             : found, return S and two empty strings.");
    7566             : 
    7567             : static PyObject*
    7568           0 : unicode_partition(PyUnicodeObject *self, PyObject *separator)
    7569             : {
                           /* Method implementation behind S.partition(sep): a thin
                            * wrapper that forwards self and the separator object to
                            * PyUnicode_Partition(). */
    7570           0 :     return PyUnicode_Partition((PyObject *)self, separator);
    7571             : }
    7572             : 
    7573             : PyDoc_STRVAR(rpartition__doc__,
    7574             :              "S.rpartition(sep) -> (head, sep, tail)\n\
    7575             : \n\
    7576             : Search for the separator sep in S, starting at the end of S, and return\n\
    7577             : the part before it, the separator itself, and the part after it.  If the\n\
    7578             : separator is not found, return two empty strings and S.");
    7579             : 
    7580             : static PyObject*
    7581           0 : unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
    7582             : {
                           /* Method implementation behind S.rpartition(sep): a thin
                            * wrapper that forwards self and the separator object to
                            * PyUnicode_RPartition(). */
    7583           0 :     return PyUnicode_RPartition((PyObject *)self, separator);
    7584             : }
    7585             : 
    7586           0 : PyObject *PyUnicode_RSplit(PyObject *s,
    7587             :                            PyObject *sep,
    7588             :                            Py_ssize_t maxsplit)
    7589             : {
                           /* Object-level rsplit: convert s (and sep, when it is not
                            * NULL) with PyUnicode_FromObject(), then call rsplit().  A
                            * NULL sep is passed through to rsplit() unchanged.  Returns
                            * rsplit()'s result, or NULL if a conversion fails. */
    7590             :     PyObject *result;
    7591             : 
    7592           0 :     s = PyUnicode_FromObject(s);
    7593           0 :     if (s == NULL)
    7594           0 :         return NULL;
    7595           0 :     if (sep != NULL) {
    7596           0 :         sep = PyUnicode_FromObject(sep);
    7597           0 :         if (sep == NULL) {
    7598           0 :             Py_DECREF(s);
    7599           0 :             return NULL;
    7600             :         }
    7601             :     }
    7602             : 
    7603           0 :     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
    7604             : 
    7605           0 :     Py_DECREF(s);
                           /* sep may still be NULL here, hence the X variant. */
    7606           0 :     Py_XDECREF(sep);
    7607           0 :     return result;
    7608             : }
    7609             : 
    7610             : PyDoc_STRVAR(rsplit__doc__,
    7611             :              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
    7612             : \n\
    7613             : Return a list of the words in S, using sep as the\n\
    7614             : delimiter string, starting at the end of the string and\n\
    7615             : working to the front.  If maxsplit is given, at most maxsplit\n\
    7616             : splits are done. If sep is not specified, any whitespace string\n\
    7617             : is a separator.");
    7618             : 
    7619             : static PyObject*
    7620           0 : unicode_rsplit(PyUnicodeObject *self, PyObject *args)
    7621             : {
                           /* Method implementation behind S.rsplit([sep[, maxsplit]]).
                            * Both arguments are optional: the separator defaults to
                            * None and the split limit to -1. */
    7622           0 :     PyObject *substring = Py_None;
    7623           0 :     Py_ssize_t maxcount = -1;
    7624             : 
    7625           0 :     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
    7626           0 :         return NULL;
    7627             : 
                           /* None -> rsplit() with a NULL separator; a unicode separator
                            * is passed to rsplit() directly; anything else goes through
                            * PyUnicode_RSplit(), which converts the separator first. */
    7628           0 :     if (substring == Py_None)
    7629           0 :         return rsplit(self, NULL, maxcount);
    7630           0 :     else if (PyUnicode_Check(substring))
    7631           0 :         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
    7632             :     else
    7633           0 :         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
    7634             : }
    7635             : 
    7636             : PyDoc_STRVAR(splitlines__doc__,
    7637             :              "S.splitlines(keepends=False) -> list of strings\n\
    7638             : \n\
    7639             : Return a list of the lines in S, breaking at line boundaries.\n\
    7640             : Line breaks are not included in the resulting list unless keepends\n\
    7641             : is given and true.");
    7642             : 
    7643             : static PyObject*
    7644           0 : unicode_splitlines(PyUnicodeObject *self, PyObject *args)
    7645             : {
                           /* Method implementation behind S.splitlines().  Parses one
                            * optional C int (keepends, 0 when omitted) and forwards it
                            * with self to PyUnicode_Splitlines(). */
    7646           0 :     int keepends = 0;
    7647             : 
    7648           0 :     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
    7649           0 :         return NULL;
    7650             : 
    7651           0 :     return PyUnicode_Splitlines((PyObject *)self, keepends);
    7652             : }
    7653             : 
    7654             : static
    7655           0 : PyObject *unicode_str(PyUnicodeObject *self)
    7656             : {
                           /* Encode self via PyUnicode_AsEncodedString() with both the
                            * encoding and the errors argument left as NULL (presumably
                            * selecting the default encoding and error handling --
                            * confirm against PyUnicode_AsEncodedString()). */
    7657           0 :     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
    7658             : }
    7659             : 
    7660             : PyDoc_STRVAR(swapcase__doc__,
    7661             :              "S.swapcase() -> unicode\n\
    7662             : \n\
    7663             : Return a copy of S with uppercase characters converted to lowercase\n\
    7664             : and vice versa.");
    7665             : 
    7666             : static PyObject*
    7667           0 : unicode_swapcase(PyUnicodeObject *self)
    7668             : {
                           /* Method implementation behind S.swapcase(): delegates to
                            * the shared fixup() driver with fixswapcase as the
                            * per-string transformation. */
    7669           0 :     return fixup(self, fixswapcase);
    7670             : }
    7671             : 
    7672             : PyDoc_STRVAR(translate__doc__,
    7673             :              "S.translate(table) -> unicode\n\
    7674             : \n\
    7675             : Return a copy of the string S, where all characters have been mapped\n\
    7676             : through the given translation table, which must be a mapping of\n\
    7677             : Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
    7678             : Unmapped characters are left untouched. Characters mapped to None\n\
    7679             : are deleted.");
    7680             : 
    7681             : static PyObject*
    7682           0 : unicode_translate(PyUnicodeObject *self, PyObject *table)
    7683             : {
                           /* Method implementation behind S.translate(table): passes
                            * self's buffer and length plus the table to
                            * PyUnicode_TranslateCharmap(), always with the "ignore"
                            * error mode. */
    7684           0 :     return PyUnicode_TranslateCharmap(self->str,
    7685             :                                       self->length,
    7686             :                                       table,
    7687             :                                       "ignore");
    7688             : }
    7689             : 
    7690             : PyDoc_STRVAR(upper__doc__,
    7691             :              "S.upper() -> unicode\n\
    7692             : \n\
    7693             : Return a copy of S converted to uppercase.");
    7694             : 
    7695             : static PyObject*
    7696           0 : unicode_upper(PyUnicodeObject *self)
    7697             : {
                           /* Method implementation behind S.upper(): delegates to the
                            * shared fixup() driver with fixupper as the per-string
                            * transformation. */
    7698           0 :     return fixup(self, fixupper);
    7699             : }
    7700             : 
    7701             : PyDoc_STRVAR(zfill__doc__,
    7702             :              "S.zfill(width) -> unicode\n\
    7703             : \n\
    7704             : Pad a numeric string S with zeros on the left, to fill a field\n\
    7705             : of the specified width. The string S is never truncated.");
    7706             : 
    7707             : static PyObject *
    7708           0 : unicode_zfill(PyUnicodeObject *self, PyObject *args)
    7709             : {
                           /* Method implementation behind S.zfill(width).  Returns NULL
                            * if argument parsing or pad() fails. */
    7710             :     Py_ssize_t fill;
    7711             :     PyUnicodeObject *u;
    7712             : 
    7713             :     Py_ssize_t width;
    7714           0 :     if (!PyArg_ParseTuple(args, "n:zfill", &width))
    7715           0 :         return NULL;
    7716             : 
                           /* Nothing to pad: an exact unicode object is returned as-is
                            * with an extra reference; any other object gets a fresh
                            * copy built from its buffer. */
    7717           0 :     if (self->length >= width) {
    7718           0 :         if (PyUnicode_CheckExact(self)) {
    7719           0 :             Py_INCREF(self);
    7720           0 :             return (PyObject*) self;
    7721             :         }
    7722             :         else
    7723           0 :             return PyUnicode_FromUnicode(
    7724           0 :                 PyUnicode_AS_UNICODE(self),
    7725             :                 PyUnicode_GET_SIZE(self)
    7726             :                 );
    7727             :     }
    7728             : 
    7729           0 :     fill = width - self->length;
    7730             : 
    7731           0 :     u = pad(self, fill, 0, '0');
    7732             : 
    7733           0 :     if (u == NULL)
    7734           0 :         return NULL;
    7735             : 
                           /* Index `fill` is where the original text is expected to
                            * begin after padding (assuming pad() puts the `fill` zeros
                            * in front -- confirm against pad()).  A sign found there is
                            * swapped with the first padding zero. */
    7736           0 :     if (u->str[fill] == '+' || u->str[fill] == '-') {
    7737             :         /* move sign to beginning of string */
    7738           0 :         u->str[0] = u->str[fill];
    7739           0 :         u->str[fill] = '0';
    7740             :     }
    7741             : 
    7742           0 :     return (PyObject*) u;
    7743             : }
    7744             : 
    7745             : #if 0
                       /* Compiled out by the surrounding "#if 0": a helper that would
                        * return the value of numfree as a Python int. */
    7746             : static PyObject*
    7747             : free_listsize(PyUnicodeObject *self)
    7748             : {
    7749             :     return PyInt_FromLong(numfree);
    7750             : }
    7751             : #endif
    7752             : 
    7753             : PyDoc_STRVAR(startswith__doc__,
    7754             :              "S.startswith(prefix[, start[, end]]) -> bool\n\
    7755             : \n\
    7756             : Return True if S starts with the specified prefix, False otherwise.\n\
    7757             : With optional start, test S beginning at that position.\n\
    7758             : With optional end, stop comparing S at that position.\n\
    7759             : prefix can also be a tuple of strings to try.");
    7760             : 
    7761             : static PyObject *
    7762           0 : unicode_startswith(PyUnicodeObject *self,
    7763             :                    PyObject *args)
    7764             : {
                           /* Method implementation behind S.startswith().  The first
                            * argument may be a single object or a tuple of candidates;
                            * start/end default to 0 and PY_SSIZE_T_MAX.  Matching is
                            * done by tailmatch() with direction -1. */
    7765             :     PyObject *subobj;
    7766             :     PyUnicodeObject *substring;
    7767           0 :     Py_ssize_t start = 0;
    7768           0 :     Py_ssize_t end = PY_SSIZE_T_MAX;
    7769             :     int result;
    7770             : 
    7771           0 :     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
    7772           0 :         return NULL;
    7773           0 :     if (PyTuple_Check(subobj)) {
                               /* Tuple form: convert and test each element in order,
                                * returning True on the first non-zero tailmatch()
                                * result.  A failed conversion aborts the loop with
                                * that conversion's error left in place. */
    7774             :         Py_ssize_t i;
    7775           0 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
    7776           0 :             substring = (PyUnicodeObject *)PyUnicode_FromObject(
    7777           0 :                 PyTuple_GET_ITEM(subobj, i));
    7778           0 :             if (substring == NULL)
    7779           0 :                 return NULL;
    7780           0 :             result = tailmatch(self, substring, start, end, -1);
    7781           0 :             Py_DECREF(substring);
    7782           0 :             if (result) {
    7783           0 :                 Py_RETURN_TRUE;
    7784             :             }
    7785             :         }
    7786             :         /* nothing matched */
    7787           0 :         Py_RETURN_FALSE;
    7788             :     }
    7789           0 :     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
    7790           0 :     if (substring == NULL) {
                               /* Replace a TypeError from the conversion with a message
                                * naming the accepted argument types; other exception
                                * types are left as raised. */
    7791           0 :         if (PyErr_ExceptionMatches(PyExc_TypeError))
    7792           0 :             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
    7793           0 :                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
    7794           0 :         return NULL;
    7795             :     }
    7796           0 :     result = tailmatch(self, substring, start, end, -1);
    7797           0 :     Py_DECREF(substring);
    7798           0 :     return PyBool_FromLong(result);
    7799             : }
    7800             : 
    7801             : 
    7802             : PyDoc_STRVAR(endswith__doc__,
    7803             :              "S.endswith(suffix[, start[, end]]) -> bool\n\
    7804             : \n\
    7805             : Return True if S ends with the specified suffix, False otherwise.\n\
    7806             : With optional start, test S beginning at that position.\n\
    7807             : With optional end, stop comparing S at that position.\n\
    7808             : suffix can also be a tuple of strings to try.");
    7809             : 
    7810             : static PyObject *
    7811           0 : unicode_endswith(PyUnicodeObject *self,
    7812             :                  PyObject *args)
    7813             : {
                           /* Method implementation behind S.endswith().  The first
                            * argument may be a single object or a tuple of candidates;
                            * start/end default to 0 and PY_SSIZE_T_MAX.  Matching is
                            * done by tailmatch() with direction +1. */
    7814             :     PyObject *subobj;
    7815             :     PyUnicodeObject *substring;
    7816           0 :     Py_ssize_t start = 0;
    7817           0 :     Py_ssize_t end = PY_SSIZE_T_MAX;
    7818             :     int result;
    7819             : 
    7820           0 :     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
    7821           0 :         return NULL;
    7822           0 :     if (PyTuple_Check(subobj)) {
                               /* Tuple form: convert and test each element in order,
                                * returning True on the first non-zero tailmatch()
                                * result and False when no element matches.  A failed
                                * conversion aborts the loop with that conversion's
                                * error left in place. */
    7823             :         Py_ssize_t i;
    7824           0 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
    7825           0 :             substring = (PyUnicodeObject *)PyUnicode_FromObject(
    7826           0 :                 PyTuple_GET_ITEM(subobj, i));
    7827           0 :             if (substring == NULL)
    7828           0 :                 return NULL;
    7829           0 :             result = tailmatch(self, substring, start, end, +1);
    7830           0 :             Py_DECREF(substring);
    7831           0 :             if (result) {
    7832           0 :                 Py_RETURN_TRUE;
    7833             :             }
    7834             :         }
    7835           0 :         Py_RETURN_FALSE;
    7836             :     }
    7837           0 :     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
    7838           0 :     if (substring == NULL) {
                               /* Replace a TypeError from the conversion with a message
                                * naming the accepted argument types; other exception
                                * types are left as raised. */
    7839           0 :         if (PyErr_ExceptionMatches(PyExc_TypeError))
    7840           0 :             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
    7841           0 :                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
    7842           0 :         return NULL;
    7843             :     }
    7844           0 :     result = tailmatch(self, substring, start, end, +1);
    7845           0 :     Py_DECREF(substring);
    7846           0 :     return PyBool_FromLong(result);
    7847             : }
    7848             : 
    7849             : 
    7850             : /* Implements do_string_format, which is unicode because of stringlib */
    7851             : #include "stringlib/string_format.h"
    7852             : 
    7853             : PyDoc_STRVAR(format__doc__,
    7854             :              "S.format(*args, **kwargs) -> unicode\n\
    7855             : \n\
    7856             : Return a formatted version of S, using substitutions from args and kwargs.\n\
    7857             : The substitutions are identified by braces ('{' and '}').");
    7858             : 
    7859             : static PyObject *
    7860           0 : unicode__format__(PyObject *self, PyObject *args)
    7861             : {
    7862             :     PyObject *format_spec;
    7863           0 :     PyObject *result = NULL;
    7864           0 :     PyObject *tmp = NULL;
    7865             : 
    7866             :     /* If 2.x, convert format_spec to the same type as value */
    7867             :     /* This is to allow things like u''.format('') */
    7868           0 :     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
    7869           0 :         goto done;
    7870           0 :     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
    7871           0 :         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
    7872           0 :                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
    7873           0 :         goto done;
    7874             :     }
    7875           0 :     tmp = PyObject_Unicode(format_spec);
    7876           0 :     if (tmp == NULL)
    7877           0 :         goto done;
    7878           0 :     format_spec = tmp;
    7879             : 
    7880           0 :     result = _PyUnicode_FormatAdvanced(self,
    7881           0 :                                        PyUnicode_AS_UNICODE(format_spec),
    7882           0 :                                        PyUnicode_GET_SIZE(format_spec));
    7883             :   done:
    7884           0 :     Py_XDECREF(tmp);
    7885           0 :     return result;
    7886             : }
    7887             : 
    7888             : PyDoc_STRVAR(p_format__doc__,
    7889             :              "S.__format__(format_spec) -> unicode\n\
    7890             : \n\
    7891             : Return a formatted version of S as described by format_spec.");
    7892             : 
    7893             : static PyObject *
    7894           0 : unicode__sizeof__(PyUnicodeObject *v)
    7895             : {
    7896           0 :     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
    7897           0 :                              sizeof(Py_UNICODE) * (v->length + 1));
    7898             : }
    7899             : 
    7900             : PyDoc_STRVAR(sizeof__doc__,
    7901             :              "S.__sizeof__() -> size of S in memory, in bytes\n\
    7902             : \n\
    7903             : ");
    7904             : 
    7905             : static PyObject *
    7906           0 : unicode_getnewargs(PyUnicodeObject *v)
    7907             : {
    7908           0 :     return Py_BuildValue("(u#)", v->str, v->length);
    7909             : }
    7910             : 
    7911             : 
    /* Method table for the unicode type.

       Each entry gives the Python-level method name, the C function that
       implements it, the calling-convention flags (METH_NOARGS, METH_O,
       METH_VARARGS, optionally | METH_KEYWORDS) and, where one exists, the
       docstring variable.  "_formatter_field_name_split", "_formatter_parser"
       and "__getnewargs__" are listed without a docstring.  Entries inside
       `#if 0` and the commented-out "maketrans" line are not compiled.  The
       table ends with a {NULL, NULL} sentinel entry.
       NOTE(review): presumably referenced from the type object's tp_methods
       slot further down the file -- confirm there. */
    7912             : static PyMethodDef unicode_methods[] = {
    7913             :     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    7914             :     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    7915             :     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    7916             :     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    7917             :     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    7918             :     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    7919             :     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    7920             :     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    7921             :     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    7922             :     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    7923             :     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    7924             :     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    7925             :     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    7926             :     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    7927             :     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    7928             :     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    7929             :     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
    7930             : /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    7931             :     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    7932             :     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    7933             :     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    7934             :     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    7935             :     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    7936             :     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    7937             :     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    7938             :     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    7939             :     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    7940             :     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    7941             :     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    7942             :     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    7943             :     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    7944             :     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    7945             :     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    7946             :     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    7947             :     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    7948             :     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    7949             :     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    7950             :     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    7951             :     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    7952             :     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    7953             :     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    7954             :     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    7955             :     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    7956             :     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    7957             :     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    7958             : #if 0
    7959             :     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
    7960             : #endif
    7961             : 
    7962             : #if 0
    7963             :     /* This one is just used for debugging the implementation. */
    7964             :     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
    7965             : #endif
    7966             : 
    7967             :     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    7968             :     {NULL, NULL}
    7969             : };
    7970             : 
    7971             : static PyObject *
    7972           0 : unicode_mod(PyObject *v, PyObject *w)
    7973             : {
    7974           0 :     if (!PyUnicode_Check(v)) {
    7975           0 :         Py_INCREF(Py_NotImplemented);
    7976           0 :         return Py_NotImplemented;
    7977             :     }
    7978           0 :     return PyUnicode_Format(v, w);
    7979             : }
    7980             : 
    /* Number-protocol slot table for the unicode type.  Only nb_remainder is
       filled in (with unicode_mod); the slots listed before it are 0 and the
       slots after it are omitted, so they are zero-initialized. */
    7981             : static PyNumberMethods unicode_as_number = {
    7982             :     0,              /*nb_add*/
    7983             :     0,              /*nb_subtract*/
    7984             :     0,              /*nb_multiply*/
    7985             :     0,              /*nb_divide*/
    7986             :     unicode_mod,            /*nb_remainder*/
    7987             : };
    7988             : 
    /* Sequence-protocol slot table for the unicode type.  Length, concat,
       repeat, item, slice and contains are provided; the two assignment
       slots (sq_ass_item, sq_ass_slice) are 0. */
    7989             : static PySequenceMethods unicode_as_sequence = {
    7990             :     (lenfunc) unicode_length,       /* sq_length */
    7991             :     PyUnicode_Concat,           /* sq_concat */
    7992             :     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    7993             :     (ssizeargfunc) unicode_getitem,     /* sq_item */
    7994             :     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    7995             :     0,                  /* sq_ass_item */
    7996             :     0,                  /* sq_ass_slice */
    7997             :     PyUnicode_Contains,         /* sq_contains */
    7998             : };
    7999             : 
    8000             : static PyObject*
    8001           0 : unicode_subscript(PyUnicodeObject* self, PyObject* item)
    8002             : {
    8003           0 :     if (PyIndex_Check(item)) {
    8004           0 :         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
    8005           0 :         if (i == -1 && PyErr_Occurred())
    8006           0 :             return NULL;
    8007           0 :         if (i < 0)
    8008           0 :             i += PyUnicode_GET_SIZE(self);
    8009           0 :         return unicode_getitem(self, i);
    8010           0 :     } else if (PySlice_Check(item)) {
    8011             :         Py_ssize_t start, stop, step, slicelength, cur, i;
    8012             :         Py_UNICODE* source_buf;
    8013             :         Py_UNICODE* result_buf;
    8014             :         PyObject* result;
    8015             : 
    8016           0 :         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
    8017             :                                  &start, &stop, &step, &slicelength) < 0) {
    8018           0 :             return NULL;
    8019             :         }
    8020             : 
    8021           0 :         if (slicelength <= 0) {
    8022           0 :             return PyUnicode_FromUnicode(NULL, 0);
    8023           0 :         } else if (start == 0 && step == 1 && slicelength == self->length &&
    8024           0 :                    PyUnicode_CheckExact(self)) {
    8025           0 :             Py_INCREF(self);
    8026           0 :             return (PyObject *)self;
    8027           0 :         } else if (step == 1) {
    8028           0 :             return PyUnicode_FromUnicode(self->str + start, slicelength);
    8029             :         } else {
    8030           0 :             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
    8031           0 :             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
    8032             :                                                        sizeof(Py_UNICODE));
    8033             : 
    8034           0 :             if (result_buf == NULL)
    8035           0 :                 return PyErr_NoMemory();
    8036             : 
    8037           0 :             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
    8038           0 :                 result_buf[i] = source_buf[cur];
    8039             :             }
    8040             : 
    8041           0 :             result = PyUnicode_FromUnicode(result_buf, slicelength);
    8042           0 :             PyObject_FREE(result_buf);
    8043           0 :             return result;
    8044             :         }
    8045             :     } else {
    8046           0 :         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
    8047           0 :         return NULL;
    8048             :     }
    8049             : }
    8050             : 
    /* Mapping-protocol slot table for the unicode type: length and
       subscript are provided; the assignment slot (mp_ass_subscript) is 0. */
    8051             : static PyMappingMethods unicode_as_mapping = {
    8052             :     (lenfunc)unicode_length,        /* mp_length */
    8053             :     (binaryfunc)unicode_subscript,  /* mp_subscript */
    8054             :     (objobjargproc)0,           /* mp_ass_subscript */
    8055             : };
    8056             : 
    8057             : static Py_ssize_t
    8058           0 : unicode_buffer_getreadbuf(PyUnicodeObject *self,
    8059             :                           Py_ssize_t index,
    8060             :                           const void **ptr)
    8061             : {
    8062           0 :     if (index != 0) {
    8063           0 :         PyErr_SetString(PyExc_SystemError,
    8064             :                         "accessing non-existent unicode segment");
    8065           0 :         return -1;
    8066             :     }
    8067           0 :     *ptr = (void *) self->str;
    8068           0 :     return PyUnicode_GET_DATA_SIZE(self);
    8069             : }
    8070             : 
    8071             : static Py_ssize_t
    8072           0 : unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
    8073             :                            const void **ptr)
    8074             : {
    8075           0 :     PyErr_SetString(PyExc_TypeError,
    8076             :                     "cannot use unicode as modifiable buffer");
    8077           0 :     return -1;
    8078             : }
    8079             : 
    8080             : static int
    8081           0 : unicode_buffer_getsegcount(PyUnicodeObject *self,
    8082             :                            Py_ssize_t *lenp)
    8083             : {
    8084           0 :     if (lenp)
    8085           0 :         *lenp = PyUnicode_GET_DATA_SIZE(self);
    8086           0 :     return 1;
    8087             : }
    8088             : 
    8089             : static Py_ssize_t
    8090           0 : unicode_buffer_getcharbuf(PyUnicodeObject *self,
    8091             :                           Py_ssize_t index,
    8092             :                           const void **ptr)
    8093             : {
    8094             :     PyObject *str;
    8095             : 
    8096           0 :     if (index != 0) {
    8097           0 :         PyErr_SetString(PyExc_SystemError,
    8098             :                         "accessing non-existent unicode segment");
    8099           0 :         return -1;
    8100             :     }
    8101           0 :     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
    8102           0 :     if (str == NULL)
    8103           0 :         return -1;
    8104           0 :     *ptr = (void *) PyString_AS_STRING(str);
    8105           0 :     return PyString_GET_SIZE(str);
    8106             : }
    8107             : 
    8108             : /* Helpers for PyUnicode_Format() */
    8109             : 
    8110             : static PyObject *
    8111           0 : getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
    8112             : {
    8113           0 :     Py_ssize_t argidx = *p_argidx;
    8114           0 :     if (argidx < arglen) {
    8115           0 :         (*p_argidx)++;
    8116           0 :         if (arglen < 0)
    8117           0 :             return args;
    8118             :         else
    8119           0 :             return PyTuple_GetItem(args, argidx);
    8120             :     }
    8121           0 :     PyErr_SetString(PyExc_TypeError,
    8122             :                     "not enough arguments for format string");
    8123           0 :     return NULL;
    8124             : }
    8125             : 
    /* Bit flags describing the flag characters of a %-format specifier.
       In the helpers below, F_ALT selects the alternate ('#') form in
       formatfloat() and formatint().
       NOTE(review): the other flags presumably correspond to the '-', '+',
       ' ' and '0' flag characters handled in PyUnicode_Format() -- confirm
       against that function. */
    8126             : #define F_LJUST (1<<0)
    8127             : #define F_SIGN  (1<<1)
    8128             : #define F_BLANK (1<<2)
    8129             : #define F_ALT   (1<<3)
    8130             : #define F_ZERO  (1<<4)
    8131             : 
    8132             : static Py_ssize_t
    8133           0 : strtounicode(Py_UNICODE *buffer, const char *charbuffer)
    8134             : {
    8135             :     register Py_ssize_t i;
    8136           0 :     Py_ssize_t len = strlen(charbuffer);
    8137           0 :     for (i = len - 1; i >= 0; i--)
    8138           0 :         buffer[i] = (Py_UNICODE) charbuffer[i];
    8139             : 
    8140           0 :     return len;
    8141             : }
    8142             : 
    8143             : static int
    8144           0 : longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
    8145             : {
    8146             :     Py_ssize_t result;
    8147             : 
    8148           0 :     PyOS_snprintf((char *)buffer, len, format, x);
    8149           0 :     result = strtounicode(buffer, (char *)buffer);
    8150           0 :     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
    8151             : }
    8152             : 
    8153             : /* XXX To save some code duplication, formatfloat/long/int could have been
    8154             :    shared with stringobject.c, converting from 8-bit to Unicode after the
    8155             :    formatting is done. */
    8156             : 
    8157             : /* Returns a new reference to a PyUnicode object, or NULL on failure. */
    8158             : 
    8159             : static PyObject *
    8160           0 : formatfloat(PyObject *v, int flags, int prec, int type)
    8161             : {
    8162             :     char *p;
    8163             :     PyObject *result;
    8164             :     double x;
    8165             : 
    8166           0 :     x = PyFloat_AsDouble(v);
    8167           0 :     if (x == -1.0 && PyErr_Occurred())
    8168           0 :         return NULL;
    8169             : 
    8170           0 :     if (prec < 0)
    8171           0 :         prec = 6;
    8172             : 
    8173           0 :     p = PyOS_double_to_string(x, type, prec,
    8174           0 :                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
    8175           0 :     if (p == NULL)
    8176           0 :         return NULL;
    8177           0 :     result = PyUnicode_FromStringAndSize(p, strlen(p));
    8178           0 :     PyMem_Free(p);
    8179           0 :     return result;
    8180             : }
    8181             : 
    8182             : static PyObject*
    8183           0 : formatlong(PyObject *val, int flags, int prec, int type)
    8184             : {
    8185             :     char *buf;
    8186             :     int i, len;
    8187             :     PyObject *str; /* temporary string object. */
    8188             :     PyUnicodeObject *result;
    8189             : 
    8190           0 :     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
    8191           0 :     if (!str)
    8192           0 :         return NULL;
    8193           0 :     result = _PyUnicode_New(len);
    8194           0 :     if (!result) {
    8195           0 :         Py_DECREF(str);
    8196           0 :         return NULL;
    8197             :     }
    8198           0 :     for (i = 0; i < len; i++)
    8199           0 :         result->str[i] = buf[i];
    8200           0 :     result->str[len] = 0;
    8201           0 :     Py_DECREF(str);
    8202           0 :     return (PyObject*)result;
    8203             : }
    8204             : 
    8205             : static int
    8206           0 : formatint(Py_UNICODE *buf,
    8207             :           size_t buflen,
    8208             :           int flags,
    8209             :           int prec,
    8210             :           int type,
    8211             :           PyObject *v)
    8212             : {
    8213             :     /* fmt = '%#.' + `prec` + 'l' + `type`
    8214             :      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
    8215             :      *                     + 1 + 1
    8216             :      *                   = 24
    8217             :      */
    8218             :     char fmt[64]; /* plenty big enough! */
    8219             :     char *sign;
    8220             :     long x;
    8221             : 
    8222           0 :     x = PyInt_AsLong(v);
    8223           0 :     if (x == -1 && PyErr_Occurred())
    8224           0 :         return -1;
    8225           0 :     if (x < 0 && type == 'u') {
    8226           0 :         type = 'd';
    8227             :     }
    8228           0 :     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
    8229           0 :         sign = "-";
    8230             :     else
    8231           0 :         sign = "";
    8232           0 :     if (prec < 0)
    8233           0 :         prec = 1;
    8234             : 
    8235             :     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
    8236             :      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
    8237             :      */
    8238           0 :     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
    8239           0 :         PyErr_SetString(PyExc_OverflowError,
    8240             :                         "formatted integer is too long (precision too large?)");
    8241           0 :         return -1;
    8242             :     }
    8243             : 
    8244           0 :     if ((flags & F_ALT) &&
    8245           0 :         (type == 'x' || type == 'X')) {
    8246             :         /* When converting under %#x or %#X, there are a number
    8247             :          * of issues that cause pain:
    8248             :          * - when 0 is being converted, the C standard leaves off
    8249             :          *   the '0x' or '0X', which is inconsistent with other
    8250             :          *   %#x/%#X conversions and inconsistent with Python's
    8251             :          *   hex() function
    8252             :          * - there are platforms that violate the standard and
    8253             :          *   convert 0 with the '0x' or '0X'
    8254             :          *   (Metrowerks, Compaq Tru64)
    8255             :          * - there are platforms that give '0x' when converting
    8256             :          *   under %#X, but convert 0 in accordance with the
    8257             :          *   standard (OS/2 EMX)
    8258             :          *
    8259             :          * We can achieve the desired consistency by inserting our
    8260             :          * own '0x' or '0X' prefix, and substituting %x/%X in place
    8261             :          * of %#x/%#X.
    8262             :          *
    8263             :          * Note that this is the same approach as used in
    8264             :          * formatint() in stringobject.c
    8265             :          */
    8266           0 :         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
    8267             :                       sign, type, prec, type);
    8268             :     }
    8269             :     else {
    8270           0 :         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
    8271           0 :                       sign, (flags&F_ALT) ? "#" : "",
    8272             :                       prec, type);
    8273             :     }
    8274           0 :     if (sign[0])
    8275           0 :         return longtounicode(buf, buflen, fmt, -x);
    8276             :     else
    8277           0 :         return longtounicode(buf, buflen, fmt, x);
    8278             : }
    8279             : 
    8280             : static int
    8281           0 : formatchar(Py_UNICODE *buf,
    8282             :            size_t buflen,
    8283             :            PyObject *v)
    8284             : {
    8285             :     PyObject *unistr;
    8286             :     char *str;
    8287             :     /* presume that the buffer is at least 2 characters long */
    8288           0 :     if (PyUnicode_Check(v)) {
    8289           0 :         if (PyUnicode_GET_SIZE(v) != 1)
    8290           0 :             goto onError;
    8291           0 :         buf[0] = PyUnicode_AS_UNICODE(v)[0];
    8292             :     }
    8293             : 
    8294           0 :     else if (PyString_Check(v)) {
    8295           0 :         if (PyString_GET_SIZE(v) != 1)
    8296           0 :             goto onError;
    8297             :         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
    8298             :            with a UnicodeDecodeError if 'char' is not decodable with the
    8299             :            default encoding (usually ASCII, but it might be something else) */
    8300           0 :         str = PyString_AS_STRING(v);
    8301           0 :         if ((unsigned char)str[0] > 0x7F) {
    8302             :             /* the char is not ASCII; try to decode the string using the
    8303             :                default encoding and return -1 to let the UnicodeDecodeError
    8304             :                be raised if the string can't be decoded */
    8305           0 :             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
    8306           0 :             if (unistr == NULL)
    8307           0 :                 return -1;
    8308           0 :             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
    8309           0 :             Py_DECREF(unistr);
    8310             :         }
    8311             :         else
    8312           0 :             buf[0] = (Py_UNICODE)str[0];
    8313             :     }
    8314             : 
    8315             :     else {
    8316             :         /* Integer input truncated to a character */
    8317             :         long x;
    8318           0 :         x = PyInt_AsLong(v);
    8319           0 :         if (x == -1 && PyErr_Occurred())
    8320           0 :             goto onError;
    8321             : #ifdef Py_UNICODE_WIDE
    8322             :         if (x < 0 || x > 0x10ffff) {
    8323             :             PyErr_SetString(PyExc_OverflowError,
    8324             :                             "%c arg not in range(0x110000) "
    8325             :                             "(wide Python build)");
    8326             :             return -1;
    8327             :         }
    8328             : #else
    8329           0 :         if (x < 0 || x > 0xffff) {
    8330           0 :             PyErr_SetString(PyExc_OverflowError,
    8331             :                             "%c arg not in range(0x10000) "
    8332             :                             "(narrow Python build)");
    8333           0 :             return -1;
    8334             :         }
    8335             : #endif
    8336           0 :         buf[0] = (Py_UNICODE) x;
    8337             :     }
    8338           0 :     buf[1] = '\0';
    8339           0 :     return 1;
    8340             : 
    8341             :   onError:
    8342           0 :     PyErr_SetString(PyExc_TypeError,
    8343             :                     "%c requires int or char");
    8344           0 :     return -1;
    8345             : }
    8346             : 
    8347             : /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
    8348             : 
    8349             :    FORMATBUFLEN is the length of the buffer in which the ints &
    8350             :    chars are formatted. XXX This is a magic number. Each formatting
    8351             :    routine does bounds checking to ensure no overflow, but a better
    8352             :    solution may be to malloc a buffer of appropriate size for each
    8353             :    format. For now, the current solution is sufficient.

       The (size_t) cast gives the constant the same type as the `buflen`
       parameter of formatint() and formatchar().
    8354             : */
    8355             : #define FORMATBUFLEN (size_t)120
    8356             : 
    8357           0 : PyObject *PyUnicode_Format(PyObject *format, /* printf-style '%' formatting of format with args; returns the new unicode result, or NULL on error */
    8358             :                            PyObject *args) /* a tuple of values, a single value, or a mapping used by %(key) specifiers */
    8359             : {
    8360             :     Py_UNICODE *fmt, *res; /* fmt: read cursor in the format text; res: write cursor in the result buffer */
    8361             :     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; /* fmtcnt: format chars left; rescnt: free slots left in result; reslen: allocated result length */
    8362           0 :     int args_owned = 0; /* becomes 1 once args holds a reference obtained from a mapping lookup */
    8363           0 :     PyUnicodeObject *result = NULL;
    8364           0 :     PyObject *dict = NULL; /* set to args when args is treated as a mapping */
    8365             :     PyObject *uformat;
    8366             : 
    8367           0 :     if (format == NULL || args == NULL) {
    8368           0 :         PyErr_BadInternalCall();
    8369           0 :         return NULL;
    8370             :     }
    8371           0 :     uformat = PyUnicode_FromObject(format); /* released with Py_DECREF on both the success and the onError exit */
    8372           0 :     if (uformat == NULL)
    8373           0 :         return NULL;
    8374           0 :     fmt = PyUnicode_AS_UNICODE(uformat);
    8375           0 :     fmtcnt = PyUnicode_GET_SIZE(uformat);
    8376             : 
    8377           0 :     reslen = rescnt = fmtcnt + 100; /* initial guess: format length plus 100 spare characters */
    8378           0 :     result = _PyUnicode_New(reslen);
    8379           0 :     if (result == NULL)
    8380           0 :         goto onError;
    8381           0 :     res = PyUnicode_AS_UNICODE(result);
    8382             : 
    8383           0 :     if (PyTuple_Check(args)) {
    8384           0 :         arglen = PyTuple_Size(args);
    8385           0 :         argidx = 0;
    8386             :     }
    8387             :     else {
    8388           0 :         arglen = -1; /* NOTE(review): -1/-2 presumably tell getnextarg that args is a single non-tuple value; confirm against getnextarg */
    8389           0 :         argidx = -2;
    8390             :     }
    8391           0 :     if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
    8392           0 :         !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
    8393           0 :         dict = args; /* any non-tuple, non-basestring object with mp_subscript is usable for %(key) lookups */
    8394             : 
    8395           0 :     while (--fmtcnt >= 0) { /* walk the format one character at a time */
    8396           0 :         if (*fmt != '%') {
    8397           0 :             if (--rescnt < 0) { /* no free slot left: grow the result */
    8398           0 :                 rescnt = fmtcnt + 100;
    8399           0 :                 reslen += rescnt;
    8400           0 :                 if (_PyUnicode_Resize(&result, reslen) < 0)
    8401           0 :                     goto onError;
    8402           0 :                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; /* re-point res at the first free slot of the resized buffer */
    8403           0 :                 --rescnt; /* account for the character copied just below */
    8404             :             }
    8405           0 :             *res++ = *fmt++; /* ordinary character: copy through unchanged */
    8406             :         }
    8407             :         else {
    8408             :             /* Got a format specifier */
    8409           0 :             int flags = 0;
    8410           0 :             Py_ssize_t width = -1;
    8411           0 :             int prec = -1; /* -1 means no precision was given */
    8412           0 :             Py_UNICODE c = '\0';
    8413             :             Py_UNICODE fill;
    8414             :             int isnumok;
    8415           0 :             PyObject *v = NULL;
    8416           0 :             PyObject *temp = NULL; /* owned intermediate object backing pbuf, released after the field is copied */
    8417             :             Py_UNICODE *pbuf; /* start of the formatted text for this specifier */
    8418             :             Py_UNICODE sign;
    8419             :             Py_ssize_t len; /* number of characters to take from pbuf */
    8420             :             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
    8421             : 
    8422           0 :             fmt++;
    8423           0 :             if (*fmt == '(') { /* %(key)...: fetch the value from the mapping */
    8424             :                 Py_UNICODE *keystart;
    8425             :                 Py_ssize_t keylen;
    8426             :                 PyObject *key;
    8427           0 :                 int pcount = 1; /* nesting depth of open parentheses */
    8428             : 
    8429           0 :                 if (dict == NULL) {
    8430           0 :                     PyErr_SetString(PyExc_TypeError,
    8431             :                                     "format requires a mapping");
    8432           0 :                     goto onError;
    8433             :                 }
    8434           0 :                 ++fmt;
    8435           0 :                 --fmtcnt;
    8436           0 :                 keystart = fmt;
    8437             :                 /* Skip over balanced parentheses */
    8438           0 :                 while (pcount > 0 && --fmtcnt >= 0) {
    8439           0 :                     if (*fmt == ')')
    8440           0 :                         --pcount;
    8441           0 :                     else if (*fmt == '(')
    8442           0 :                         ++pcount;
    8443           0 :                     fmt++;
    8444             :                 }
    8445           0 :                 keylen = fmt - keystart - 1; /* -1 leaves out the closing ')' */
    8446           0 :                 if (fmtcnt < 0 || pcount > 0) {
    8447           0 :                     PyErr_SetString(PyExc_ValueError,
    8448             :                                     "incomplete format key");
    8449           0 :                     goto onError;
    8450             :                 }
    8451             : #if 0
    8452             :                 /* keys are converted to strings using UTF-8 and
    8453             :                    then looked up since Python uses strings to hold
    8454             :                    variables names etc. in its namespaces and we
    8455             :                    wouldn't want to break common idioms. */
    8456             :                 key = PyUnicode_EncodeUTF8(keystart,
    8457             :                                            keylen,
    8458             :                                            NULL);
    8459             : #else
    8460           0 :                 key = PyUnicode_FromUnicode(keystart, keylen);
    8461             : #endif
    8462           0 :                 if (key == NULL)
    8463           0 :                     goto onError;
    8464           0 :                 if (args_owned) { /* drop the value fetched for an earlier %(key) specifier */
    8465           0 :                     Py_DECREF(args);
    8466           0 :                     args_owned = 0;
    8467             :                 }
    8468           0 :                 args = PyObject_GetItem(dict, key); /* args now refers to the looked-up value */
    8469           0 :                 Py_DECREF(key);
    8470           0 :                 if (args == NULL) {
    8471           0 :                     goto onError;
    8472             :                 }
    8473           0 :                 args_owned = 1;
    8474           0 :                 arglen = -1;
    8475           0 :                 argidx = -2;
    8476             :             }
    8477           0 :             while (--fmtcnt >= 0) { /* collect flag characters; the first non-flag character is left in c */
    8478           0 :                 switch (c = *fmt++) {
    8479           0 :                 case '-': flags |= F_LJUST; continue;
    8480           0 :                 case '+': flags |= F_SIGN; continue;
    8481           0 :                 case ' ': flags |= F_BLANK; continue;
    8482           0 :                 case '#': flags |= F_ALT; continue;
    8483           0 :                 case '0': flags |= F_ZERO; continue;
    8484             :                 }
    8485           0 :                 break;
    8486             :             }
    8487           0 :             if (c == '*') { /* width is taken from the next argument */
    8488           0 :                 v = getnextarg(args, arglen, &argidx);
    8489           0 :                 if (v == NULL)
    8490           0 :                     goto onError;
    8491           0 :                 if (!PyInt_Check(v)) {
    8492           0 :                     PyErr_SetString(PyExc_TypeError,
    8493             :                                     "* wants int");
    8494           0 :                     goto onError;
    8495             :                 }
    8496           0 :                 width = PyInt_AsSsize_t(v);
    8497           0 :                 if (width == -1 && PyErr_Occurred())
    8498           0 :                     goto onError;
    8499           0 :                 if (width < 0) { /* a negative width means left-justify with the absolute value */
    8500           0 :                     flags |= F_LJUST;
    8501           0 :                     width = -width;
    8502             :                 }
    8503           0 :                 if (--fmtcnt >= 0)
    8504           0 :                     c = *fmt++;
    8505             :             }
    8506           0 :             else if (c >= '0' && c <= '9') { /* width written as decimal digits */
    8507           0 :                 width = c - '0';
    8508           0 :                 while (--fmtcnt >= 0) {
    8509           0 :                     c = *fmt++;
    8510           0 :                     if (c < '0' || c > '9')
    8511             :                         break;
    8512           0 :                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { /* checked before width*10 + digit is computed */
    8513           0 :                         PyErr_SetString(PyExc_ValueError,
    8514             :                                         "width too big");
    8515           0 :                         goto onError;
    8516             :                     }
    8517           0 :                     width = width*10 + (c - '0');
    8518             :                 }
    8519             :             }
    8520           0 :             if (c == '.') { /* optional precision, either '*' or decimal digits */
    8521           0 :                 prec = 0;
    8522           0 :                 if (--fmtcnt >= 0)
    8523           0 :                     c = *fmt++;
    8524           0 :                 if (c == '*') {
    8525           0 :                     v = getnextarg(args, arglen, &argidx);
    8526           0 :                     if (v == NULL)
    8527           0 :                         goto onError;
    8528           0 :                     if (!PyInt_Check(v)) {
    8529           0 :                         PyErr_SetString(PyExc_TypeError,
    8530             :                                         "* wants int");
    8531           0 :                         goto onError;
    8532             :                     }
    8533           0 :                     prec = _PyInt_AsInt(v);
    8534           0 :                     if (prec == -1 && PyErr_Occurred())
    8535           0 :                         goto onError;
    8536           0 :                     if (prec < 0) /* a negative precision argument is clamped to 0 */
    8537           0 :                         prec = 0;
    8538           0 :                     if (--fmtcnt >= 0)
    8539           0 :                         c = *fmt++;
    8540             :                 }
    8541           0 :                 else if (c >= '0' && c <= '9') {
    8542           0 :                     prec = c - '0';
    8543           0 :                     while (--fmtcnt >= 0) {
    8544           0 :                         c = *fmt++;
    8545           0 :                         if (c < '0' || c > '9')
    8546             :                             break;
    8547           0 :                         if (prec > (INT_MAX - ((int)c - '0')) / 10) { /* checked before prec*10 + digit is computed */
    8548           0 :                             PyErr_SetString(PyExc_ValueError,
    8549             :                                             "prec too big");
    8550           0 :                             goto onError;
    8551             :                         }
    8552           0 :                         prec = prec*10 + (c - '0');
    8553             :                     }
    8554             :                 }
    8555             :             } /* prec */
    8556           0 :             if (fmtcnt >= 0) {
    8557           0 :                 if (c == 'h' || c == 'l' || c == 'L') { /* length modifiers are accepted and skipped */
    8558           0 :                     if (--fmtcnt >= 0)
    8559           0 :                         c = *fmt++;
    8560             :                 }
    8561             :             }
    8562           0 :             if (fmtcnt < 0) { /* the format ended before a conversion character was read */
    8563           0 :                 PyErr_SetString(PyExc_ValueError,
    8564             :                                 "incomplete format");
    8565           0 :                 goto onError;
    8566             :             }
    8567           0 :             if (c != '%') { /* every conversion except %% takes one argument */
    8568           0 :                 v = getnextarg(args, arglen, &argidx);
    8569           0 :                 if (v == NULL)
    8570           0 :                     goto onError;
    8571             :             }
    8572           0 :             sign = 0; /* set to 1 by the numeric conversions; later replaced by the sign character to emit, or 0 */
    8573           0 :             fill = ' ';
    8574           0 :             switch (c) {
    8575             : 
    8576             :             case '%':
    8577           0 :                 pbuf = formatbuf;
    8578             :                 /* presume that buffer length is at least 1 */
    8579           0 :                 pbuf[0] = '%';
    8580           0 :                 len = 1;
    8581           0 :                 break;
    8582             : 
    8583             :             case 's':
    8584             :             case 'r':
    8585           0 :                 if (PyUnicode_CheckExact(v) && c == 's') { /* exact unicode with %s: use the object itself */
    8586           0 :                     temp = v;
    8587           0 :                     Py_INCREF(temp);
    8588             :                 }
    8589             :                 else {
    8590             :                     PyObject *unicode;
    8591           0 :                     if (c == 's')
    8592           0 :                         temp = PyObject_Unicode(v);
    8593             :                     else
    8594           0 :                         temp = PyObject_Repr(v);
    8595           0 :                     if (temp == NULL)
    8596           0 :                         goto onError;
    8597           0 :                     if (PyUnicode_Check(temp))
    8598             :                         /* nothing to do */;
    8599           0 :                     else if (PyString_Check(temp)) {
    8600             :                         /* convert to string to Unicode */
    8601           0 :                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
    8602             :                                                    PyString_GET_SIZE(temp),
    8603             :                                                    NULL,
    8604             :                                                    "strict");
    8605           0 :                         Py_DECREF(temp);
    8606           0 :                         temp = unicode;
    8607           0 :                         if (temp == NULL)
    8608           0 :                             goto onError;
    8609             :                     }
    8610             :                     else {
    8611           0 :                         Py_DECREF(temp);
    8612           0 :                         PyErr_SetString(PyExc_TypeError,
    8613             :                                         "%s argument has non-string str()");
    8614           0 :                         goto onError;
    8615             :                     }
    8616             :                 }
    8617           0 :                 pbuf = PyUnicode_AS_UNICODE(temp);
    8618           0 :                 len = PyUnicode_GET_SIZE(temp);
    8619           0 :                 if (prec >= 0 && len > prec) /* a precision truncates the text */
    8620           0 :                     len = prec;
    8621           0 :                 break;
    8622             : 
    8623             :             case 'i':
    8624             :             case 'd':
    8625             :             case 'u':
    8626             :             case 'o':
    8627             :             case 'x':
    8628             :             case 'X':
    8629           0 :                 if (c == 'i') /* %i is handled as %d */
    8630           0 :                     c = 'd';
    8631           0 :                 isnumok = 0;
    8632           0 :                 if (PyNumber_Check(v)) {
    8633           0 :                     PyObject *iobj=NULL;
    8634             : 
    8635           0 :                     if (PyInt_Check(v) || (PyLong_Check(v))) {
    8636           0 :                         iobj = v;
    8637           0 :                         Py_INCREF(iobj);
    8638             :                     }
    8639             :                     else {
    8640           0 :                         iobj = PyNumber_Int(v);
    8641           0 :                         if (iobj==NULL) {
    8642           0 :                             PyErr_Clear();
    8643           0 :                             iobj = PyNumber_Long(v); /* second attempt after PyNumber_Int failed */
    8644             :                         }
    8645             :                     }
    8646           0 :                     if (iobj!=NULL) {
    8647           0 :                         if (PyInt_Check(iobj)) { /* int: format into the stack buffer */
    8648           0 :                             isnumok = 1;
    8649           0 :                             pbuf = formatbuf;
    8650           0 :                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
    8651             :                                             flags, prec, c, iobj);
    8652           0 :                             Py_DECREF(iobj);
    8653           0 :                             if (len < 0)
    8654           0 :                                 goto onError;
    8655           0 :                             sign = 1;
    8656             :                         }
    8657           0 :                         else if (PyLong_Check(iobj)) { /* long: formatlong's result is kept in temp */
    8658           0 :                             isnumok = 1;
    8659           0 :                             temp = formatlong(iobj, flags, prec, c);
    8660           0 :                             Py_DECREF(iobj);
    8661           0 :                             if (!temp)
    8662           0 :                                 goto onError;
    8663           0 :                             pbuf = PyUnicode_AS_UNICODE(temp);
    8664           0 :                             len = PyUnicode_GET_SIZE(temp);
    8665           0 :                             sign = 1;
    8666             :                         }
    8667             :                         else {
    8668           0 :                             Py_DECREF(iobj);
    8669             :                         }
    8670             :                     }
    8671             :                 }
    8672           0 :                 if (!isnumok) { /* no int or long could be obtained from v */
    8673           0 :                     PyErr_Format(PyExc_TypeError,
    8674             :                                  "%%%c format: a number is required, "
    8675           0 :                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
    8676           0 :                     goto onError;
    8677             :                 }
    8678           0 :                 if (flags & F_ZERO)
    8679           0 :                     fill = '0';
    8680           0 :                 break;
    8681             : 
    8682             :             case 'e':
    8683             :             case 'E':
    8684             :             case 'f':
    8685             :             case 'F':
    8686             :             case 'g':
    8687             :             case 'G':
    8688           0 :                 temp = formatfloat(v, flags, prec, c);
    8689           0 :                 if (temp == NULL)
    8690           0 :                     goto onError;
    8691           0 :                 pbuf = PyUnicode_AS_UNICODE(temp);
    8692           0 :                 len = PyUnicode_GET_SIZE(temp);
    8693           0 :                 sign = 1;
    8694           0 :                 if (flags & F_ZERO)
    8695           0 :                     fill = '0';
    8696           0 :                 break;
    8697             : 
    8698             :             case 'c':
    8699           0 :                 pbuf = formatbuf;
    8700           0 :                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
    8701           0 :                 if (len < 0)
    8702           0 :                     goto onError;
    8703           0 :                 break;
    8704             : 
    8705             :             default:
    8706           0 :                 PyErr_Format(PyExc_ValueError,
    8707             :                              "unsupported format character '%c' (0x%x) "
    8708             :                              "at index %zd",
    8709           0 :                              (31<=c && c<=126) ? (char)c : '?',
    8710             :                              (int)c,
    8711           0 :                              (Py_ssize_t)(fmt - 1 -
    8712           0 :                                           PyUnicode_AS_UNICODE(uformat)));
    8713           0 :                 goto onError;
    8714             :             }
    8715           0 :             if (sign) { /* numeric result: split off a leading sign, or derive one from the '+' / ' ' flags */
    8716           0 :                 if (*pbuf == '-' || *pbuf == '+') {
    8717           0 :                     sign = *pbuf++;
    8718           0 :                     len--;
    8719             :                 }
    8720           0 :                 else if (flags & F_SIGN)
    8721           0 :                     sign = '+';
    8722           0 :                 else if (flags & F_BLANK)
    8723           0 :                     sign = ' ';
    8724             :                 else
    8725           0 :                     sign = 0;
    8726             :             }
    8727           0 :             if (width < len) /* the field is never narrower than its content */
    8728           0 :                 width = len;
    8729           0 :             if (rescnt - (sign != 0) < width) { /* not enough free slots for this field: grow the result */
    8730           0 :                 reslen -= rescnt;
    8731           0 :                 rescnt = width + fmtcnt + 100;
    8732           0 :                 reslen += rescnt;
    8733           0 :                 if (reslen < 0) { /* a negative total length is reported as out of memory */
    8734           0 :                     Py_XDECREF(temp);
    8735           0 :                     PyErr_NoMemory();
    8736           0 :                     goto onError;
    8737             :                 }
    8738           0 :                 if (_PyUnicode_Resize(&result, reslen) < 0) {
    8739           0 :                     Py_XDECREF(temp);
    8740           0 :                     goto onError;
    8741             :                 }
    8742           0 :                 res = PyUnicode_AS_UNICODE(result)
    8743           0 :                     + reslen - rescnt;
    8744             :             }
    8745           0 :             if (sign) {
    8746           0 :                 if (fill != ' ') /* with a non-space fill the sign is written before the padding */
    8747           0 :                     *res++ = sign;
    8748           0 :                 rescnt--;
    8749           0 :                 if (width > len)
    8750           0 :                     width--;
    8751             :             }
    8752           0 :             if ((flags & F_ALT) && (c == 'x' || c == 'X')) { /* '#' with x/X: pbuf begins with '0' followed by c (see asserts) */
    8753             :                 assert(pbuf[0] == '0');
    8754             :                 assert(pbuf[1] == c);
    8755           0 :                 if (fill != ' ') { /* with a non-space fill the prefix is written before the padding */
    8756           0 :                     *res++ = *pbuf++;
    8757           0 :                     *res++ = *pbuf++;
    8758             :                 }
    8759           0 :                 rescnt -= 2;
    8760           0 :                 width -= 2;
    8761           0 :                 if (width < 0)
    8762           0 :                     width = 0;
    8763           0 :                 len -= 2;
    8764             :             }
    8765           0 :             if (width > len && !(flags & F_LJUST)) { /* pad on the left unless left-justified */
    8766             :                 do {
    8767           0 :                     --rescnt;
    8768           0 :                     *res++ = fill;
    8769           0 :                 } while (--width > len);
    8770             :             }
    8771           0 :             if (fill == ' ') { /* with space fill the sign and prefix are written after the padding */
    8772           0 :                 if (sign)
    8773           0 :                     *res++ = sign;
    8774           0 :                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
    8775             :                     assert(pbuf[0] == '0');
    8776             :                     assert(pbuf[1] == c);
    8777           0 :                     *res++ = *pbuf++;
    8778           0 :                     *res++ = *pbuf++;
    8779             :                 }
    8780             :             }
    8781           0 :             Py_UNICODE_COPY(res, pbuf, len); /* the formatted text itself */
    8782           0 :             res += len;
    8783           0 :             rescnt -= len;
    8784           0 :             while (--width >= len) { /* pad on the right with spaces while width still exceeds len */
    8785           0 :                 --rescnt;
    8786           0 :                 *res++ = ' ';
    8787             :             }
    8788           0 :             if (dict && (argidx < arglen) && c != '%') { /* NOTE(review): relies on how getnextarg advances argidx; confirm against getnextarg */
    8789           0 :                 PyErr_SetString(PyExc_TypeError,
    8790             :                                 "not all arguments converted during string formatting");
    8791           0 :                 Py_XDECREF(temp);
    8792           0 :                 goto onError;
    8793             :             }
    8794           0 :             Py_XDECREF(temp);
    8795             :         } /* '%' */
    8796             :     } /* until end */
    8797           0 :     if (argidx < arglen && !dict) { /* arguments left over after the whole format was processed */
    8798           0 :         PyErr_SetString(PyExc_TypeError,
    8799             :                         "not all arguments converted during string formatting");
    8800           0 :         goto onError;
    8801             :     }
    8802             : 
    8803           0 :     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) /* trim the result to the characters actually written */
    8804           0 :         goto onError;
    8805           0 :     if (args_owned) {
    8806           0 :         Py_DECREF(args);
    8807             :     }
    8808           0 :     Py_DECREF(uformat);
    8809           0 :     return (PyObject *)result;
    8810             : 
    8811             :   onError: /* shared failure exit: release result, uformat and an owned args, then return NULL */
    8812           0 :     Py_XDECREF(result);
    8813           0 :     Py_DECREF(uformat);
    8814           0 :     if (args_owned) {
    8815           0 :         Py_DECREF(args);
    8816             :     }
    8817           0 :     return NULL;
    8818             : }
    8819             : 
    8820             : static PyBufferProcs unicode_as_buffer = { /* buffer-interface callbacks for unicode; installed as tp_as_buffer in PyUnicode_Type */
    8821             :     (readbufferproc) unicode_buffer_getreadbuf,
    8822             :     (writebufferproc) unicode_buffer_getwritebuf,
    8823             :     (segcountproc) unicode_buffer_getsegcount,
    8824             :     (charbufferproc) unicode_buffer_getcharbuf,
    8825             : };
    8826             : 
    8827             : static PyObject * /* forward declaration: unicode_new calls this before its definition appears */
    8828             : unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
    8829             : 
    8830             : static PyObject * /* tp_new slot of PyUnicode_Type */
    8831           3 : unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) /* accepts optional string, encoding and errors arguments */
    8832             : {
    8833           3 :     PyObject *x = NULL;
    8834             :     static char *kwlist[] = {"string", "encoding", "errors", 0};
    8835           3 :     char *encoding = NULL;
    8836           3 :     char *errors = NULL;
    8837             : 
    8838           3 :     if (type != &PyUnicode_Type) /* any other type is handled by unicode_subtype_new */
    8839           0 :         return unicode_subtype_new(type, args, kwds);
    8840           3 :     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", /* the leading '|' makes all three arguments optional */
    8841             :                                      kwlist, &x, &encoding, &errors))
    8842           0 :         return NULL;
    8843           3 :     if (x == NULL) /* no object given: return the result of _PyUnicode_New(0) */
    8844           0 :         return (PyObject *)_PyUnicode_New(0);
    8845           3 :     if (encoding == NULL && errors == NULL) /* neither encoding nor errors given: convert with PyObject_Unicode */
    8846           3 :         return PyObject_Unicode(x);
    8847             :     else
    8848           0 :         return PyUnicode_FromEncodedObject(x, encoding, errors); /* otherwise pass encoding/errors through */
    8849             : }
    8850             : 
    8851             : static PyObject * /* creates an instance of a unicode subtype by copying a freshly built plain unicode object */
    8852           0 : unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
    8853             : {
    8854             :     PyUnicodeObject *tmp, *pnew; /* tmp: plain unicode built from the arguments; pnew: the subtype instance returned */
    8855             :     Py_ssize_t n;
    8856             : 
    8857             :     assert(PyType_IsSubtype(type, &PyUnicode_Type));
    8858           0 :     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); /* reuse the base constructor for argument handling */
    8859           0 :     if (tmp == NULL)
    8860           0 :         return NULL;
    8861             :     assert(PyUnicode_Check(tmp));
    8862           0 :     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); /* n is set to tmp's length here */
    8863           0 :     if (pnew == NULL) {
    8864           0 :         Py_DECREF(tmp);
    8865           0 :         return NULL;
    8866             :     }
    8867           0 :     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); /* n+1 elements, matching the n+1 copied below */
    8868           0 :     if (pnew->str == NULL) { /* buffer allocation failed: dispose of pnew and tmp before reporting the error */
    8869             :         _Py_ForgetReference((PyObject *)pnew);
    8870           0 :         PyObject_Del(pnew);
    8871           0 :         Py_DECREF(tmp);
    8872           0 :         return PyErr_NoMemory();
    8873             :     }
    8874           0 :     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
    8875           0 :     pnew->length = n;
    8876           0 :     pnew->hash = tmp->hash; /* carry over the hash field stored on tmp */
    8877           0 :     Py_DECREF(tmp);
    8878           0 :     return (PyObject *)pnew;
    8879             : }
    8880             : 
    8881             : PyDoc_STRVAR(unicode_doc, /* docstring text; referenced as tp_doc in PyUnicode_Type */
    8882             :              "unicode(object='') -> unicode object\n\
    8883             : unicode(string[, encoding[, errors]]) -> unicode object\n\
    8884             : \n\
    8885             : Create a new Unicode object from the given encoded string.\n\
    8886             : encoding defaults to the current default string encoding.\n\
    8887             : errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
    8888             : 
    8889             : PyTypeObject PyUnicode_Type = { /* type object for "unicode"; slots are positional, each labelled with its field name */
    8890             :     PyVarObject_HEAD_INIT(&PyType_Type, 0)
    8891             :     "unicode",              /* tp_name */
    8892             :     sizeof(PyUnicodeObject),        /* tp_basicsize */
    8893             :     0,                  /* tp_itemsize */
    8894             :     /* Slots */
    8895             :     (destructor)unicode_dealloc,    /* tp_dealloc */
    8896             :     0,                  /* tp_print */
    8897             :     0,                  /* tp_getattr */
    8898             :     0,                  /* tp_setattr */
    8899             :     0,                  /* tp_compare */
    8900             :     unicode_repr,           /* tp_repr */
    8901             :     &unicode_as_number,         /* tp_as_number */
    8902             :     &unicode_as_sequence,       /* tp_as_sequence */
    8903             :     &unicode_as_mapping,        /* tp_as_mapping */
    8904             :     (hashfunc) unicode_hash,        /* tp_hash*/
    8905             :     0,                  /* tp_call*/
    8906             :     (reprfunc) unicode_str,     /* tp_str */
    8907             :     PyObject_GenericGetAttr,        /* tp_getattro */
    8908             :     0,                  /* tp_setattro */
    8909             :     &unicode_as_buffer,         /* tp_as_buffer */
    8910             :     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    8911             :     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    8912             :     unicode_doc,            /* tp_doc */
    8913             :     0,                  /* tp_traverse */
    8914             :     0,                  /* tp_clear */
    8915             :     PyUnicode_RichCompare,      /* tp_richcompare */
    8916             :     0,                  /* tp_weaklistoffset */
    8917             :     0,                  /* tp_iter */
    8918             :     0,                  /* tp_iternext */
    8919             :     unicode_methods,            /* tp_methods */
    8920             :     0,                  /* tp_members */
    8921             :     0,                  /* tp_getset */
    8922             :     &PyBaseString_Type,         /* tp_base */
    8923             :     0,                  /* tp_dict */
    8924             :     0,                  /* tp_descr_get */
    8925             :     0,                  /* tp_descr_set */
    8926             :     0,                  /* tp_dictoffset */
    8927             :     0,                  /* tp_init */
    8928             :     0,                  /* tp_alloc */
    8929             :     unicode_new,            /* tp_new */
    8930             :     PyObject_Del,           /* tp_free */
    8931             : };
    8932             : 
    8933             : /* Initialize the Unicode implementation */
    8934             : 
    8935           3 : void _PyUnicode_Init(void)
    8936             : {
    8937             :     /* XXX - move this array to unicodectype.c ? */
    8938           3 :     Py_UNICODE linebreak[] = {
    8939             :         0x000A, /* LINE FEED */
    8940             :         0x000D, /* CARRIAGE RETURN */
    8941             :         0x001C, /* FILE SEPARATOR */
    8942             :         0x001D, /* GROUP SEPARATOR */
    8943             :         0x001E, /* RECORD SEPARATOR */
    8944             :         0x0085, /* NEXT LINE */
    8945             :         0x2028, /* LINE SEPARATOR */
    8946             :         0x2029, /* PARAGRAPH SEPARATOR */
    8947             :     };
    8948             : 
    8949             :     /* Init the implementation */
    8950           3 :     if (!unicode_empty) {
    8951           3 :         unicode_empty = _PyUnicode_New(0);
    8952           3 :         if (!unicode_empty)
    8953           3 :             return;
    8954             :     }
    8955             : 
    8956           3 :     if (PyType_Ready(&PyUnicode_Type) < 0)
    8957           0 :         Py_FatalError("Can't initialize 'unicode'");
    8958             : 
    8959             :     /* initialize the linebreak bloom filter */
    8960           3 :     bloom_linebreak = make_bloom_mask(
    8961             :         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
    8962             :         );
    8963             : 
    8964           3 :     PyType_Ready(&EncodingMapType);
    8965             : 
    8966           3 :     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
    8967           0 :         Py_FatalError("Can't initialize field name iterator type");
    8968             : 
    8969           3 :     if (PyType_Ready(&PyFormatterIter_Type) < 0)
    8970           0 :         Py_FatalError("Can't initialize formatter iter type");
    8971             : }
    8972             : 
    8973             : /* Finalize the Unicode implementation */
    8974             : 
    8975             : int
    8976           6 : PyUnicode_ClearFreeList(void)
    8977             : {
    8978           6 :     int freelist_size = numfree;
    8979             :     PyUnicodeObject *u;
    8980             : 
    8981          51 :     for (u = free_list; u != NULL;) {
    8982          39 :         PyUnicodeObject *v = u;
    8983          39 :         u = *(PyUnicodeObject **)u;
    8984          39 :         if (v->str)
    8985          36 :             PyObject_DEL(v->str);
    8986          39 :         Py_XDECREF(v->defenc);
    8987          39 :         PyObject_Del(v);
    8988          39 :         numfree--;
    8989             :     }
    8990           6 :     free_list = NULL;
    8991             :     assert(numfree == 0);
    8992           6 :     return freelist_size;
    8993             : }
    8994             : 
    8995             : void
    8996           3 : _PyUnicode_Fini(void)
    8997             : {
    8998             :     int i;
    8999             : 
    9000           3 :     Py_CLEAR(unicode_empty);
    9001             : 
    9002         771 :     for (i = 0; i < 256; i++)
    9003         768 :         Py_CLEAR(unicode_latin1[i]);
    9004             : 
    9005           3 :     (void)PyUnicode_ClearFreeList();
    9006           3 : }
    9007             : 
    9008             : #ifdef __cplusplus
    9009             : }
    9010             : #endif

Generated by: LCOV version 1.10