LCOV - code coverage report
Current view: top level - Objects/stringlib - split.h (source / functions) Hit Total Coverage
Test: CPython lcov report Lines: 58 182 31.9 %
Date: 2017-04-19 Functions: 4 7 57.1 %

          Line data    Source code
       1             : /* stringlib: split implementation */
       2             : 
       3             : #ifndef STRINGLIB_SPLIT_H
       4             : #define STRINGLIB_SPLIT_H
       5             : 
       6             : #ifndef STRINGLIB_FASTSEARCH_H
       7             : #error must include "stringlib/fastsearch.h" before including this module
       8             : #endif
       9             : 
      10             : /* Overallocate the initial list to reduce the number of reallocs for small
      11             :    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
      12             :    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
      13             :    text (roughly 11 words per line) and field delimited data (usually 1-10
      14             :    fields).  For large strings the split algorithms are bandwidth limited
      15             :    so increasing the preallocation likely will not improve things.*/
      16             : 
      17             : #define MAX_PREALLOC 12
      18             : 
      19             : /* 5 splits gives 6 elements */
      20             : #define PREALLOC_SIZE(maxsplit) \
      21             :     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
      22             : 
      23             : #define SPLIT_APPEND(data, left, right)         \
      24             :     sub = STRINGLIB_NEW((data) + (left),        \
      25             :                         (right) - (left));      \
      26             :     if (sub == NULL)                            \
      27             :         goto onError;                           \
      28             :     if (PyList_Append(list, sub)) {             \
      29             :         Py_DECREF(sub);                         \
      30             :         goto onError;                           \
      31             :     }                                           \
      32             :     else                                        \
      33             :         Py_DECREF(sub);
      34             : 
      35             : #define SPLIT_ADD(data, left, right) {          \
      36             :     sub = STRINGLIB_NEW((data) + (left),        \
      37             :                         (right) - (left));      \
      38             :     if (sub == NULL)                            \
      39             :         goto onError;                           \
      40             :     if (count < MAX_PREALLOC) {                 \
      41             :         PyList_SET_ITEM(list, count, sub);      \
      42             :     } else {                                    \
      43             :         if (PyList_Append(list, sub)) {         \
      44             :             Py_DECREF(sub);                     \
      45             :             goto onError;                       \
      46             :         }                                       \
      47             :         else                                    \
      48             :             Py_DECREF(sub);                     \
      49             :     }                                           \
      50             :     count++; }
      51             : 
      52             : 
      53             : /* Always force the list to the expected size. */
      54             : #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
      55             : 
      56             : Py_LOCAL_INLINE(PyObject *)
      57          51 : stringlib_split_whitespace(PyObject* str_obj,
      58             :                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
      59             :                            Py_ssize_t maxcount)
      60             : {
      61          51 :     Py_ssize_t i, j, count=0;
      62          51 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
      63             :     PyObject *sub;
      64             : 
      65          51 :     if (list == NULL)
      66           0 :         return NULL;
      67             : 
      68          51 :     i = j = 0;
      69         381 :     while (maxcount-- > 0) {
      70         900 :         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      71         240 :             i++;
      72         330 :         if (i == str_len) break;
      73         279 :         j = i; i++;
      74        1935 :         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
      75        1377 :             i++;
      76             : #ifndef STRINGLIB_MUTABLE
      77         279 :         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
      78             :             /* No whitespace in str_obj, so just use it as list[0] */
      79           0 :             Py_INCREF(str_obj);
      80           0 :             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
      81           0 :             count++;
      82           0 :             break;
      83             :         }
      84             : #endif
      85         279 :         SPLIT_ADD(str, j, i);
      86             :     }
      87             : 
      88          51 :     if (i < str_len) {
      89             :         /* Only occurs when maxcount was reached */
      90             :         /* Skip any remaining whitespace and copy to end of string */
      91           0 :         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
      92           0 :             i++;
      93           0 :         if (i != str_len)
      94           0 :             SPLIT_ADD(str, i, str_len);
      95             :     }
      96          51 :     FIX_PREALLOC_SIZE(list);
      97          51 :     return list;
      98             : 
      99             :   onError:
     100           0 :     Py_DECREF(list);
     101           0 :     return NULL;
     102             : }
     103             : 
     104             : Py_LOCAL_INLINE(PyObject *)
     105          27 : stringlib_split_char(PyObject* str_obj,
     106             :                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     107             :                      const STRINGLIB_CHAR ch,
     108             :                      Py_ssize_t maxcount)
     109             : {
     110          27 :     Py_ssize_t i, j, count=0;
     111          27 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     112             :     PyObject *sub;
     113             : 
     114          27 :     if (list == NULL)
     115           0 :         return NULL;
     116             : 
     117          27 :     i = j = 0;
     118         135 :     while ((j < str_len) && (maxcount-- > 0)) {
     119         378 :         for(; j < str_len; j++) {
     120             :             /* I found that using memchr makes no difference */
     121         351 :             if (str[j] == ch) {
     122          54 :                 SPLIT_ADD(str, i, j);
     123          54 :                 i = j = j + 1;
     124          54 :                 break;
     125             :             }
     126             :         }
     127             :     }
     128             : #ifndef STRINGLIB_MUTABLE
     129          27 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     130             :         /* ch not in str_obj, so just use str_obj as list[0] */
     131          18 :         Py_INCREF(str_obj);
     132          18 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     133          18 :         count++;
     134             :     } else
     135             : #endif
     136           9 :     if (i <= str_len) {
     137           9 :         SPLIT_ADD(str, i, str_len);
     138             :     }
     139          27 :     FIX_PREALLOC_SIZE(list);
     140          27 :     return list;
     141             : 
     142             :   onError:
     143           0 :     Py_DECREF(list);
     144           0 :     return NULL;
     145             : }
     146             : 
     147             : Py_LOCAL_INLINE(PyObject *)
     148          27 : stringlib_split(PyObject* str_obj,
     149             :                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     150             :                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     151             :                 Py_ssize_t maxcount)
     152             : {
     153          27 :     Py_ssize_t i, j, pos, count=0;
     154             :     PyObject *list, *sub;
     155             : 
     156          27 :     if (sep_len == 0) {
     157           0 :         PyErr_SetString(PyExc_ValueError, "empty separator");
     158           0 :         return NULL;
     159             :     }
     160          27 :     else if (sep_len == 1)
     161          27 :         return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
     162             : 
     163           0 :     list = PyList_New(PREALLOC_SIZE(maxcount));
     164           0 :     if (list == NULL)
     165           0 :         return NULL;
     166             : 
     167           0 :     i = j = 0;
     168           0 :     while (maxcount-- > 0) {
     169           0 :         pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
     170           0 :         if (pos < 0)
     171           0 :             break;
     172           0 :         j = i + pos;
     173           0 :         SPLIT_ADD(str, i, j);
     174           0 :         i = j + sep_len;
     175             :     }
     176             : #ifndef STRINGLIB_MUTABLE
     177           0 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     178             :         /* No match in str_obj, so just use it as list[0] */
     179           0 :         Py_INCREF(str_obj);
     180           0 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     181           0 :         count++;
     182             :     } else
     183             : #endif
     184             :     {
     185           0 :         SPLIT_ADD(str, i, str_len);
     186             :     }
     187           0 :     FIX_PREALLOC_SIZE(list);
     188           0 :     return list;
     189             : 
     190             :   onError:
     191           0 :     Py_DECREF(list);
     192           0 :     return NULL;
     193             : }
     194             : 
     195             : Py_LOCAL_INLINE(PyObject *)
     196           0 : stringlib_rsplit_whitespace(PyObject* str_obj,
     197             :                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     198             :                             Py_ssize_t maxcount)
     199             : {
     200           0 :     Py_ssize_t i, j, count=0;
     201           0 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     202             :     PyObject *sub;
     203             : 
     204           0 :     if (list == NULL)
     205           0 :         return NULL;
     206             : 
     207           0 :     i = j = str_len - 1;
     208           0 :     while (maxcount-- > 0) {
     209           0 :         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     210           0 :             i--;
     211           0 :         if (i < 0) break;
     212           0 :         j = i; i--;
     213           0 :         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
     214           0 :             i--;
     215             : #ifndef STRINGLIB_MUTABLE
     216           0 :         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     217             :             /* No whitespace in str_obj, so just use it as list[0] */
     218           0 :             Py_INCREF(str_obj);
     219           0 :             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     220           0 :             count++;
     221           0 :             break;
     222             :         }
     223             : #endif
     224           0 :         SPLIT_ADD(str, i + 1, j + 1);
     225             :     }
     226             : 
     227           0 :     if (i >= 0) {
     228             :         /* Only occurs when maxcount was reached */
     229             :         /* Skip any remaining whitespace and copy to beginning of string */
     230           0 :         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
     231           0 :             i--;
     232           0 :         if (i >= 0)
     233           0 :             SPLIT_ADD(str, 0, i + 1);
     234             :     }
     235           0 :     FIX_PREALLOC_SIZE(list);
     236           0 :     if (PyList_Reverse(list) < 0)
     237           0 :         goto onError;
     238           0 :     return list;
     239             : 
     240             :   onError:
     241           0 :     Py_DECREF(list);
     242           0 :     return NULL;
     243             : }
     244             : 
     245             : Py_LOCAL_INLINE(PyObject *)
     246           0 : stringlib_rsplit_char(PyObject* str_obj,
     247             :                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     248             :                       const STRINGLIB_CHAR ch,
     249             :                       Py_ssize_t maxcount)
     250             : {
     251           0 :     Py_ssize_t i, j, count=0;
     252           0 :     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
     253             :     PyObject *sub;
     254             : 
     255           0 :     if (list == NULL)
     256           0 :         return NULL;
     257             : 
     258           0 :     i = j = str_len - 1;
     259           0 :     while ((i >= 0) && (maxcount-- > 0)) {
     260           0 :         for(; i >= 0; i--) {
     261           0 :             if (str[i] == ch) {
     262           0 :                 SPLIT_ADD(str, i + 1, j + 1);
     263           0 :                 j = i = i - 1;
     264           0 :                 break;
     265             :             }
     266             :         }
     267             :     }
     268             : #ifndef STRINGLIB_MUTABLE
     269           0 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     270             :         /* ch not in str_obj, so just use str_obj as list[0] */
     271           0 :         Py_INCREF(str_obj);
     272           0 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     273           0 :         count++;
     274             :     } else
     275             : #endif
     276           0 :     if (j >= -1) {
     277           0 :         SPLIT_ADD(str, 0, j + 1);
     278             :     }
     279           0 :     FIX_PREALLOC_SIZE(list);
     280           0 :     if (PyList_Reverse(list) < 0)
     281           0 :         goto onError;
     282           0 :     return list;
     283             : 
     284             :   onError:
     285           0 :     Py_DECREF(list);
     286           0 :     return NULL;
     287             : }
     288             : 
     289             : Py_LOCAL_INLINE(PyObject *)
     290           0 : stringlib_rsplit(PyObject* str_obj,
     291             :                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     292             :                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
     293             :                  Py_ssize_t maxcount)
     294             : {
     295           0 :     Py_ssize_t j, pos, count=0;
     296             :     PyObject *list, *sub;
     297             : 
     298           0 :     if (sep_len == 0) {
     299           0 :         PyErr_SetString(PyExc_ValueError, "empty separator");
     300           0 :         return NULL;
     301             :     }
     302           0 :     else if (sep_len == 1)
     303           0 :         return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
     304             : 
     305           0 :     list = PyList_New(PREALLOC_SIZE(maxcount));
     306           0 :     if (list == NULL)
     307           0 :         return NULL;
     308             : 
     309           0 :     j = str_len;
     310           0 :     while (maxcount-- > 0) {
     311           0 :         pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
     312           0 :         if (pos < 0)
     313           0 :             break;
     314           0 :         SPLIT_ADD(str, pos + sep_len, j);
     315           0 :         j = pos;
     316             :     }
     317             : #ifndef STRINGLIB_MUTABLE
     318           0 :     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
     319             :         /* No match in str_obj, so just use it as list[0] */
     320           0 :         Py_INCREF(str_obj);
     321           0 :         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
     322           0 :         count++;
     323             :     } else
     324             : #endif
     325             :     {
     326           0 :         SPLIT_ADD(str, 0, j);
     327             :     }
     328           0 :     FIX_PREALLOC_SIZE(list);
     329           0 :     if (PyList_Reverse(list) < 0)
     330           0 :         goto onError;
     331           0 :     return list;
     332             : 
     333             :   onError:
     334           0 :     Py_DECREF(list);
     335           0 :     return NULL;
     336             : }
     337             : 
     338             : Py_LOCAL_INLINE(PyObject *)
     339           9 : stringlib_splitlines(PyObject* str_obj,
     340             :                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
     341             :                      int keepends)
     342             : {
     343             :     /* This does not use the preallocated list because splitlines is
     344             :        usually run with hundreds of newlines.  The overhead of
     345             :        switching between PyList_SET_ITEM and append causes about a
     346             :        2-3% slowdown for that common case.  A smarter implementation
     347             :        could move the if check out, so the SET_ITEMs are done first
     348             :        and the appends only done when the prealloc buffer is full.
     349             :        That's too much work for little gain.*/
     350             : 
     351             :     register Py_ssize_t i;
     352             :     register Py_ssize_t j;
     353           9 :     PyObject *list = PyList_New(0);
     354             :     PyObject *sub;
     355             : 
     356           9 :     if (list == NULL)
     357           0 :         return NULL;
     358             : 
     359         771 :     for (i = j = 0; i < str_len; ) {
     360             :         Py_ssize_t eol;
     361             : 
     362             :         /* Find a line and append it */
     363       29244 :         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
     364       27732 :             i++;
     365             : 
     366             :         /* Skip the line break reading CRLF as one line break */
     367         756 :         eol = i;
     368         756 :         if (i < str_len) {
     369         753 :             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
     370           0 :                 i += 2;
     371             :             else
     372         753 :                 i++;
     373         753 :             if (keepends)
     374           0 :                 eol = i;
     375             :         }
     376             : #ifndef STRINGLIB_MUTABLE
     377         756 :         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
     378             :             /* No linebreak in str_obj, so just use it as list[0] */
     379           3 :             if (PyList_Append(list, str_obj))
     380           0 :                 goto onError;
     381           3 :             break;
     382             :         }
     383             : #endif
     384         753 :         SPLIT_APPEND(str, j, eol);
     385         753 :         j = i;
     386             :     }
     387           9 :     return list;
     388             : 
     389             :   onError:
     390           0 :     Py_DECREF(list);
     391           0 :     return NULL;
     392             : }
     393             : 
     394             : #endif

Generated by: LCOV version 1.10