/home/andy/git/oilshell/oil/mycpp/gc_str.cc

Source (jump to first uncovered line)
#include "mycpp/gc_str.h"

#include <ctype.h>  // isalpha(), isdigit()
#include <stdarg.h>

#include <regex>

#include "mycpp/common.h"
#include "mycpp/gc_alloc.h"     // NewStr()
#include "mycpp/gc_builtins.h"  // StringToInt()
#include "mycpp/gc_list.h"      // join(), split() use it

// The shared empty string.  Several functions in this file (StripAny(),
// join(), AppendPart(), split()) return it instead of allocating a new
// zero-length BigStr.
GLOBAL_STR(kEmptyString, "");

// Matches one chunk of a format string for _StrFormat():
//   group 1: a run of literal text containing no '%' (may be empty)
// optionally followed by a '%' directive made of
//   group 2: an optional '-' and optional digits (the width field)
//   group 3: exactly one character, the conversion code
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
// Size of the scratch buffer _StrFormat() formats integers into; the width of
// a directive is DCHECK'd to stay below it.
static const int kMaxFmtWidth = 256;  // arbitrary...

// Return the lowest index in [start, end) at which 'needle' begins, or -1 if
// it does not occur there.  Passing end == -1 means "up to the end of this
// string".
int BigStr::find(BigStr* needle, int start, int end) {
  if (end == -1) {
    end = len(this);
  }

  const int n = len(needle);
  if (n > end - start) {
    return -1;  // needle is too long to be found (Python behavior)
  }

  if (n == 1) {
    // Single byte: scan the window [start, end).  The guard above ensures the
    // window holds at least one byte here.
    // e.g. 'aaa'.find('a', 0, 1) examines exactly one position.
    const void* hit = memchr(data_ + start, needle->data_[0], end - start);
    if (hit == nullptr) {
      return -1;
    }
    return static_cast<int>(static_cast<const char*>(hit) - data_);
  }

  // General case.  This also handles the empty needle, which is found in an
  // empty window like [5, 5) but not in [5, 4).
  //
  // e.g. 'aaa'.find('aa', 0, 2): offset 0 is the only candidate.
  //
  // could use a smarter substring search algorithm
  const int final_offset = end - n;  // last offset where needle still fits
  int pos = start;
  while (pos <= final_offset) {
    if (memcmp(data_ + pos, needle->data_, n) == 0) {
      return pos;
    }
    ++pos;
  }
  return -1;
}

// Return the highest index at which the one-byte 'needle' occurs in this
// string, or -1 if it does not occur.
int BigStr::rfind(BigStr* needle) {
  DCHECK(len(needle) == 1);  // Oils usage
  const char wanted = needle->data_[0];

  // Walk backwards from the last byte.
  int pos = len(this);
  while (pos > 0) {
    --pos;
    if (data_[pos] == wanted) {
      return pos;
    }
  }
  return -1;
}

// True if the string is non-empty and every byte is a decimal digit according
// to ::isdigit().
bool BigStr::isdigit() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behavior.
    if (!::isdigit(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// True if the string is non-empty and every byte is alphabetic according to
// ::isalpha().
bool BigStr::isalpha() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behavior.
    if (!::isalpha(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// True if the string is non-empty and every byte is an uppercase letter
// according to ::isupper().
// e.g. for osh/braces.py
bool BigStr::isupper() {
  int n = len(this);
  if (n == 0) {
    return false;  // special case
  }
  for (int i = 0; i < n; ++i) {
    // The <ctype.h> functions require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behavior.
    if (!::isupper(static_cast<unsigned char>(data_[i]))) {
      return false;
    }
  }
  return true;
}

// True if this string begins with the bytes of 's'.
bool BigStr::startswith(BigStr* s) {
  const int prefix_len = len(s);
  // A prefix longer than the string can never match; otherwise compare the
  // leading bytes.
  return prefix_len <= len(this) && memcmp(data_, s->data_, prefix_len) == 0;
}

// True if this string ends with the bytes of 's'.
bool BigStr::endswith(BigStr* s) {
  const int suffix_len = len(s);
  const int skip = len(this) - suffix_len;  // bytes before the candidate tail
  if (skip < 0) {
    return false;  // suffix is longer than the string
  }
  return memcmp(data_ + skip, s->data_, suffix_len) == 0;
}

// Return a new one-byte string holding the byte at index i.  A negative i
// counts back from the end of the string.
BigStr* BigStr::at(int i) {
  const int length = len(this);
  i = (i < 0) ? length + i : i;

  DCHECK(0 <= i);
  DCHECK(i < length);  // had a problem here!

  BigStr* one_char = NewStr(1);
  one_char->data_[0] = data_[i];
  return one_char;
}

// s[begin:] -- everything from 'begin' to the end of the string.
BigStr* BigStr::slice(int begin) {
  const int end = len(this);
  return slice(begin, end);
}

// s[begin:end]
//
// Returns a new string holding the bytes in [begin, end).  The indices are
// first normalized by the SLICE_ADJUST macro; after that both must lie in
// [0, length].
// NOTE(review): SLICE_ADJUST is defined elsewhere; presumably it applies
// Python-style negative-index and clamping rules -- confirm.
BigStr* BigStr::slice(int begin, int end) {
  int length = len(this);
  SLICE_ADJUST(begin, end, length);

  DCHECK(0 <= begin && begin <= length);
  DCHECK(0 <= end && end <= length);

  int new_len = end - begin;
  DCHECK(0 <= new_len && new_len <= length);

  BigStr* result = NewStr(new_len);  // has kEmptyString optimization
  memcpy(result->data_, data_ + begin, new_len);

  return result;
}

// Used by 'help' builtin and --help, neither of which translate yet.
//
// Not implemented: after checking that keep is true, this unconditionally
// reports kNotImplemented through FAIL().

List<BigStr*>* BigStr::splitlines(bool keep) {
  DCHECK(keep == true);
  FAIL(kNotImplemented);
}

// Return a new string of the same length in which every byte has been passed
// through toupper().
BigStr* BigStr::upper() {
  int length = len(this);
  BigStr* result = NewStr(length);
  char* buffer = result->data();
  for (int char_index = 0; char_index < length; ++char_index) {
    // toupper() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    buffer[char_index] = static_cast<char>(
        toupper(static_cast<unsigned char>(data_[char_index])));
  }
  return result;
}

// Return a new string of the same length in which every byte has been passed
// through tolower().
BigStr* BigStr::lower() {
  int length = len(this);
  BigStr* result = NewStr(length);
  char* buffer = result->data();
  for (int char_index = 0; char_index < length; ++char_index) {
    // tolower() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    buffer[char_index] = static_cast<char>(
        tolower(static_cast<unsigned char>(data_[char_index])));
  }
  return result;
}

// Left-justify: return a string of length 'width' consisting of this string
// followed by copies of the one-byte 'fillchar'.  If this string is already
// longer than 'width', it is returned unchanged.
BigStr* BigStr::ljust(int width, BigStr* fillchar) {
  DCHECK(len(fillchar) == 1);

  const int length = len(this);
  const int pad = width - length;
  if (pad < 0) {
    return this;
  }

  BigStr* padded = NewStr(width);
  memcpy(padded->data_, data_, length);                  // original text
  memset(padded->data_ + length, fillchar->data_[0], pad);  // then the fill
  return padded;
}

// Right-justify: return a string of length 'width' consisting of copies of
// the one-byte 'fillchar' followed by this string.  If this string is already
// longer than 'width', it is returned unchanged.
BigStr* BigStr::rjust(int width, BigStr* fillchar) {
  DCHECK(len(fillchar) == 1);

  const int length = len(this);
  const int pad = width - length;
  if (pad < 0) {
    return this;
  }

  BigStr* padded = NewStr(width);
  memset(padded->data_, fillchar->data_[0], pad);  // the fill first
  memcpy(padded->data_ + pad, data_, length);      // then the original text
  return padded;
}

// Replace every occurrence of 'old' with 'new_str'; forwards to the
// three-argument overload with no limit on the number of replacements.
BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
  // Use -1 as in python2: "aaaa".replace("a", "A", -1) -> "AAAA"
  return replace(old, new_str, -1);
}

// Replace occurrences of 'old' with 'new_str', scanning left to right.
//
// Args:
//   old: substring to search for.  If it is empty, new_str is inserted before
//        each byte and once more at the end, as in Python.
//   new_str: replacement text
//   count: maximum number of replacements; a negative value means no limit
//
// Returns 'this' itself when nothing is replaced, otherwise a new string.
BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
  // log("replacing %s with %s", old_data, new_str->data_);
  const char* old_data = old->data_;

  int this_len = len(this);
  int old_len = len(old);

  // Nothing to do when no replacement is allowed.  Without this check, an
  // empty 'old' would size the result for this_len + 1 insertions in the first
  // pass while the second pass performs none, leaving a garbage tail.
  if (count == 0) {
    return this;
  }

  // 'old' cannot occur in a shorter string.  Returning here also avoids
  // computing a pointer that lies before data_.
  if (old_len > this_len) {
    return this;
  }

  // Last position in 'this' at which a match of 'old' can still start
  const char* last_possible = data_ + this_len - old_len;

  const char* p_this = data_;  // advances through 'this'

  // First pass: Calculate number of replacements, and hence new length
  int replace_count = 0;
  if (old_len == 0) {
    replace_count = this_len + 1;
    if (count > 0) {
      replace_count = min(replace_count, count);
    }
  } else {
    while (p_this <= last_possible) {
      if (replace_count != count &&  // limit replacements (if count != -1)
          memcmp(p_this, old_data, old_len) == 0) {  // equal
        replace_count++;
        p_this += old_len;
      } else {
        p_this++;
      }
    }
  }

  // log("replacements %d", replace_count);

  if (replace_count == 0) {
    return this;  // Reuse the string if there were no replacements
  }

  int new_str_len = len(new_str);
  int result_len =
      this_len - (replace_count * old_len) + (replace_count * new_str_len);

  BigStr* result = NewStr(result_len);

  const char* new_data = new_str->data_;
  const size_t new_len = new_str_len;

  // Second pass: Copy pieces into 'result'
  p_this = data_;                  // back to beginning
  char* p_result = result->data_;  // advances through 'result'
  replace_count = 0;

  if (old_len == 0) {
    // Should place new_str between each char in this.  Here last_possible
    // points one past the final byte of 'this'.
    while (p_this < last_possible && replace_count != count) {
      replace_count++;
      memcpy(p_result, new_data, new_len);  // Copy from new_str
      p_result += new_len;                  // Move past new_str

      // Write a char from this
      *p_result = *p_this;
      p_this++;
      p_result++;
    }

    if (replace_count != count) {
      // Write a copy of new_str at the end
      assert(p_this == last_possible);
      memcpy(p_result, new_data, new_len);
    } else if (p_this <= last_possible) {
      // The limit was reached: write the last part of string unchanged
      memcpy(p_result, p_this, data_ + this_len - p_this);
    }
  } else {
    while (p_this <= last_possible) {
      // Note: would be more efficient if we remembered the match positions
      if (replace_count != count &&  // limit replacements (if count != -1)
          memcmp(p_this, old_data, old_len) == 0) {  // equal
        memcpy(p_result, new_data, new_len);         // Copy from new_str
        replace_count++;
        p_result += new_len;
        p_this += old_len;
      } else {  // copy 1 byte
        *p_result = *p_this;
        p_result++;
        p_this++;
      }
    }
    memcpy(p_result, p_this, data_ + this_len - p_this);  // last part of string
  }

  return result;
}

// Selects which end(s) of the string StripAny() removes characters from.
enum class StripWhere {
  Left,   // leading characters only
  Right,  // trailing characters only
  Both,   // both ends
};

// Sentinel for the 'what' argument of OmitChar() / StripAny(): strip
// whitespace rather than one specific character.
const int kWhitespace = -1;

// Decide whether character 'ch' should be stripped.  'what' is either
// kWhitespace or the single character being stripped.
bool OmitChar(int ch, int what) {
  if (what != kWhitespace) {
    return ch == what;
  }

  // Intentional incompatibility with Python, where say \v is whitespace
  // '\v'.strip() == ''
  //
  // But it is consistent with the JSON spec [ \t\r\n] and the rules in
  // frontend/lexer_def.py
  //
  // Note that the YSH is separate, and Str => trim() respects Unicode.
  return IsAsciiWhitespace(ch);
}

// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
// implement 6 functions:
//
//   strip / lstrip / rstrip
//   strip(char) / lstrip(char) / rstrip(char)
//
// Args:
//   where: which ends to strip from
//   what: kWhitespace, or a byte value 0-255
//
// Bytes are converted through unsigned char before they are compared.  A
// plain char >= 0x80 would otherwise sign-extend to a negative int where char
// is signed, and byte 0xFF would become -1, which is the kWhitespace
// sentinel.
//
// Returns kEmptyString if everything is stripped, 's' itself if nothing is
// stripped, and a new string otherwise.

BigStr* StripAny(BigStr* s, StripWhere where, int what) {
  int length = len(s);
  const char* char_data = s->data();

  // i: index of the first byte to keep
  int i = 0;
  if (where != StripWhere::Right) {
    while (i < length &&
           OmitChar(static_cast<unsigned char>(char_data[i]), what)) {
      i++;
    }
  }

  // j: one past the last byte to keep; never moves below i
  int j = length;
  if (where != StripWhere::Left) {
    do {
      j--;
    } while (j >= i &&
             OmitChar(static_cast<unsigned char>(char_data[j]), what));
    j++;
  }

  if (i == j) {  // Optimization to reuse existing object
    return kEmptyString;
  }

  if (i == 0 && j == length) {  // nothing stripped
    return s;
  }

  // Note: makes a copy in leaky version, and will in GC version too
  int new_len = j - i;
  BigStr* result = NewStr(new_len);
  memcpy(result->data(), s->data() + i, new_len);
  return result;
}

// Strip whitespace from both ends.
BigStr* BigStr::strip() {
  return StripAny(this, StripWhere::Both, kWhitespace);
}

// Strip trailing occurrences of the single byte in 'chars'.
// Used for CommandSub in osh/cmd_exec.py
BigStr* BigStr::rstrip(BigStr* chars) {
  DCHECK(len(chars) == 1);
  // Convert via unsigned char so the value is in 0-255 and cannot equal
  // kWhitespace.
  int c = static_cast<unsigned char>(chars->data_[0]);
  return StripAny(this, StripWhere::Right, c);
}

// Strip trailing whitespace.
BigStr* BigStr::rstrip() {
  return StripAny(this, StripWhere::Right, kWhitespace);
}

// Strip leading occurrences of the single byte in 'chars'.
BigStr* BigStr::lstrip(BigStr* chars) {
  DCHECK(len(chars) == 1);
  // Convert via unsigned char so the value is in 0-255 and cannot equal
  // kWhitespace.
  int c = static_cast<unsigned char>(chars->data_[0]);
  return StripAny(this, StripWhere::Left, c);
}

// Strip leading whitespace.
BigStr* BigStr::lstrip() {
  return StripAny(this, StripWhere::Left, kWhitespace);
}

// Concatenate 'items', using this string as the separator between
// consecutive elements.
BigStr* BigStr::join(List<BigStr*>* items) {
  const int num_parts = len(items);

  // " ".join([]) == ""
  if (num_parts == 0) {
    return kEmptyString;
  }

  // Common case
  // 'anything'.join(["foo"]) == "foo"
  if (num_parts == 1) {
    return items->at(0);
  }

  // Size of the result: every item plus one separator between each pair.
  const int sep_len = len(this);
  int total = 0;
  for (int i = 0; i < num_parts; ++i) {
    total += len(items->at(i));
  }
  total += sep_len * (num_parts - 1);

  BigStr* joined = NewStr(total);
  char* out = joined->data_;  // write cursor into 'joined'

  for (int i = 0; i < num_parts; ++i) {
    // Separator goes before every item except the first; skipped entirely
    // for the common case of ''.join()
    if (i > 0 && sep_len != 0) {
      memcpy(out, data_, sep_len);
      out += sep_len;
    }

    BigStr* item = items->at(i);
    const int item_len = len(item);
    memcpy(out, item->data_, item_len);
    out += item_len;
  }

  return joined;
}

// Append the bytes s[left:right] to 'result' as a string.  An empty range
// appends the shared kEmptyString instead of allocating.
static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
  const int part_len = right - left;
  if (part_len == 0) {
    result->append(kEmptyString);
    return;
  }

  BigStr* piece = NewStr(part_len);
  memcpy(piece->data_, s->data_ + left, part_len);
  result->append(piece);
}

// Split BigStr into List<BigStr*> of parts separated by 'sep'.
// The code structure is taken from CPython's Objects/stringlib/split.h.
//
// Args:
//   sep: the separator; must be exactly one byte long
//   max_split: upper bound on the number of splits performed
List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
  DCHECK(sep != nullptr);
  DCHECK(len(sep) == 1);  // we can only split one char
  const char delim = sep->data_[0];

  const int str_len = len(this);
  if (str_len == 0) {
    // weird case consistent with Python: ''.split(':') == ['']
    return NewList<BigStr*>({kEmptyString});
  }

  List<BigStr*>* parts = NewList<BigStr*>({});
  int part_start = 0;  // where the part currently being scanned begins
  int num_splits = 0;  // 3 splits results in 4 parts

  // Walk the string once; each separator closes the current part, until the
  // split limit is reached.
  for (int pos = 0; pos < str_len && num_splits < max_split; ++pos) {
    if (data_[pos] != delim) {
      continue;
    }
    AppendPart(parts, this, part_start, pos);
    part_start = pos + 1;
    num_splits++;
  }

  if (num_splits == 0) {  // Optimization when there is no split
    parts->append(this);
  } else if (part_start <= str_len) {  // Last part
    AppendPart(parts, this, part_start, str_len);
  }

  return parts;
}

// Split on every occurrence of the one-byte 'sep'.  The split limit passed
// along is the string's length; a string cannot contain more separators than
// it has bytes.
List<BigStr*>* BigStr::split(BigStr* sep) {
  const int max_split = len(this);
  return split(sep, max_split);
}

// Return this string's hash, computing it with 'h' on first use and caching
// it in hash_ afterwards.  The value from 'h' is shifted right by one bit
// before being stored.
// NOTE(review): the shift presumably makes the value fit a hash_ field
// narrower than 'unsigned' -- confirm against the class declaration.
unsigned BigStr::hash(HashFunc h) {
  if (is_hashed_) {
    return hash_;  // already cached
  }
  hash_ = h(data_, len(this)) >> 1;
  is_hashed_ = 1;
  return hash_;
}

// Shared implementation of the StrFormat() overloads: a printf-like formatter
// for the subset of Python's % operator handled below.
//
// Args:
//   fmt, fmt_len: the format string and its length in bytes
//   args: the variadic arguments, consumed left to right
//
// Supported conversion codes:
//   %%      a literal '%'
//   %s      a BigStr*
//   %r      a BigStr*, passed through repr()
//   %d %o   an int, formatted by snprintf() (decimal / octal)
//
// A directive may carry a width, with a leading '-' to pad on the right and
// a leading '0' to pad with zeros.
static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
  auto end = std::cregex_iterator();

  char int_buf[kMaxFmtWidth];  // scratch space for %d / %o
  std::string buf;             // accumulates the output
  for (std::cregex_iterator it = beg; it != end; ++it) {
    const std::cmatch& match = *it;

    // Group 1: literal text preceding the directive, copied through as is
    const std::csub_match& lit_m = match[1];
    DCHECK(lit_m.matched);
    const std::string& lit_s = lit_m.str();
    buf.append(lit_s);

    // Group 2: the optional width field
    int width = 0;
    bool zero_pad = false;
    bool pad_back = false;
    const std::csub_match& width_m = match[2];
    const std::string& width_s = width_m.str();
    bool ok = false;
    if (width_m.matched && !width_s.empty()) {
      if (width_s[0] == '0') {
        zero_pad = true;
        DCHECK(width_s.size() > 1);
        // Parse the digits after the '0' flag
        ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
        DCHECK(ok);
      } else {
        ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
        DCHECK(ok);
      }
      (void)ok;  // silence unused var warning in opt
      if (width < 0) {
        // A leading '-' parsed as a negative number: pad after the value
        pad_back = true;
        width *= -1;
      }
      DCHECK(0 <= width && width < kMaxFmtWidth);
    }

    // Group 3: the conversion code
    char const* str_to_add = nullptr;
    int add_len = 0;
    const std::csub_match& code_m = match[3];
    const std::string& code_s = code_m.str();
    if (!code_m.matched) {
      // No directive in this match, so stop processing
      DCHECK(!width_m.matched);  // python errors on invalid format operators
      break;
    }
    DCHECK(code_s.size() == 1);
    switch (code_s[0]) {
    case '%': {
      str_to_add = code_s.c_str();
      add_len = 1;
      break;
    }
    case 's': {
      BigStr* s = va_arg(args, BigStr*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);

      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'r': {
      BigStr* s = va_arg(args, BigStr*);
      // Check type unconditionally because mycpp doesn't always check it
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);

      s = repr(s);
      str_to_add = s->data();
      add_len = len(s);
      zero_pad = false;  // python ignores the 0 directive for strings
      break;
    }
    case 'd':  // fallthrough
    case 'o': {
      int d = va_arg(args, int);
      // Skip the literal prefix of the match so that only the directive
      // itself (e.g. "%05d") is handed to snprintf() as the format.
      add_len = snprintf(int_buf, kMaxFmtWidth,
                         match.str().c_str() + lit_s.size(), d);
      DCHECK(add_len > 0);
      // snprintf() returns the length the complete output would have had (or
      // a negative value on error), not the number of bytes stored.  Clamp it
      // to what int_buf really holds so that the append() below can never
      // read past the buffer when DCHECKs are compiled out.
      if (add_len < 0) {
        add_len = 0;
      } else if (add_len >= kMaxFmtWidth) {
        add_len = kMaxFmtWidth - 1;
      }
      str_to_add = int_buf;
      break;
    }
    default:
      DCHECK(0);
      break;
    }
    DCHECK(str_to_add != nullptr);

    // Emit the value, with padding before it (default) or after it ('-')
    if (pad_back) {
      buf.append(str_to_add, add_len);
    }
    if (add_len < width) {
      for (int i = 0; i < width - add_len; ++i) {
        buf.push_back(zero_pad ? '0' : ' ');
      }
    }
    if (!pad_back) {
      buf.append(str_to_add, add_len);
    }
  }

  return StrFromC(buf.c_str(), buf.size());
}

// Return a new one-byte string holding the byte at the iterator's current
// position i_ in s_.
BigStr* StrIter::Value() {  // similar to at()
  const char current = s_->data_[i_];

  BigStr* result = NewStr(1);
  result->data_[0] = current;
  DCHECK(result->data_[1] == '\0');
  return result;
}

// Format the variadic arguments according to the NUL-terminated C string
// 'fmt' and return the result as a BigStr.  See _StrFormat() for the
// directives that are understood.
BigStr* StrFormat(const char* fmt, ...) {
  const int fmt_len = static_cast<int>(strlen(fmt));

  va_list ap;
  va_start(ap, fmt);
  BigStr* formatted = _StrFormat(fmt, fmt_len, ap);
  va_end(ap);

  return formatted;
}

// Format the variadic arguments according to the BigStr 'fmt' and return the
// result as a BigStr.  See _StrFormat() for the directives that are
// understood.
BigStr* StrFormat(BigStr* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  const char* fmt_data = fmt->data();
  const int fmt_len = len(fmt);
  BigStr* formatted = _StrFormat(fmt_data, fmt_len, ap);
  va_end(ap);

  return formatted;
}

cpp

Coverage Report

Created: 2024-08-25 11:48

Line	Count	Source (jump to first uncovered line)
1		#include "mycpp/gc_str.h"
2
3		#include <ctype.h> // isalpha(), isdigit()
4		#include <stdarg.h>
5
6		#include <regex>
7
8		#include "mycpp/common.h"
9		#include "mycpp/gc_alloc.h" // NewStr()
10		#include "mycpp/gc_builtins.h" // StringToInt()
11		#include "mycpp/gc_list.h" // join(), split() use it
12
13		GLOBAL_STR(kEmptyString, "");
14
15		static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16		static const int kMaxFmtWidth = 256; // arbitrary...
17
18	133	int BigStr::find(BigStr* needle, int start, int end) {
19	133	if (end == -1) {
20	69	end = len(this);
21	69	}
22	133	int needle_len = len(needle);
23
24	133	if (needle_len > (end - start)) {
25	19	return -1; // needle is too long to be found (Python behavior)
26	19	}
27
28	114	if (needle_len == 1) {
29	52	char c = needle->data_[0];
30		// For 'aaa'.find('a', 0, 1)
31		// end = 1, needle_len = 1, last_start = 1 which means we go through once
32	221	for (int i = start; i < end; ++i) {
33	189	if (data_[i] == c) {
34	20	return i;
35	20	}
36	189	}
37	62	} else {
38		// Note: this works for finding the empty string. Empty string is found in
39		// empty range like [5, 5), but not in [5, 4)
40
41		// For 'aaa'.find('aa', 0, 2)
42		// end = 2, needle_len = 2, last_start = 1 which means we go through once
43
44	62	int last_start = end - needle_len + 1;
45		// could use a smarter substring search algorithm
46	125	for (int i = start; i < last_start; ++i) {
47	108	if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
48	45	return i;
49	45	}
50	108	}
51	62	}
52	49	return -1;
53	114	}
54
55	10	int BigStr::rfind(BigStr* needle) {
56	10	int length = len(this);
57	10	DCHECK(len(needle) == 1); // Oils usage
58	0	char c = needle->data_[0];
59	50	for (int i = length - 1; i >= 0; --i) {
60	46	if (data_[i] == c) {
61	6	return i;
62	6	}
63	46	}
64	4	return -1;
65	10	}
66
67	51	bool BigStr::isdigit() {
68	51	int n = len(this);
69	51	if (n == 0) {
70	2	return false; // special case
71	2	}
72	65	for (int i = 0; i < n; ++i) {
73	49	if (!::isdigit(data_[i])) {
74	33	return false;
75	33	}
76	49	}
77	16	return true;
78	49	}
79
80	35	bool BigStr::isalpha() {
81	35	int n = len(this);
82	35	if (n == 0) {
83	0	return false; // special case
84	0	}
85	53	for (int i = 0; i < n; ++i) {
86	39	if (!::isalpha(data_[i])) {
87	21	return false;
88	21	}
89	39	}
90	14	return true;
91	35	}
92
93		// e.g. for osh/braces.py
94	8	bool BigStr::isupper() {
95	8	int n = len(this);
96	8	if (n == 0) {
97	2	return false; // special case
98	2	}
99	12	for (int i = 0; i < n; ++i) {
100	8	if (!::isupper(data_[i])) {
101	2	return false;
102	2	}
103	8	}
104	4	return true;
105	6	}
106
107	21	bool BigStr::startswith(BigStr* s) {
108	21	int n = len(s);
109	21	if (n > len(this)) {
110	0	return false;
111	0	}
112	21	return memcmp(data_, s->data_, n) == 0;
113	21	}
114
115	12	bool BigStr::endswith(BigStr* s) {
116	12	int len_s = len(s);
117	12	int len_this = len(this);
118	12	if (len_s > len_this) {
119	1	return false;
120	1	}
121	11	const char* start = data_ + len_this - len_s;
122	11	return memcmp(start, s->data_, len_s) == 0;
123	12	}
124
125		// Get a string with one character
126	95	BigStr* BigStr::at(int i) {
127	95	int length = len(this);
128	95	if (i < 0) {
129	2	i = length + i;
130	2	}
131	95	DCHECK(0 <= i);
132	95	DCHECK(i < length); // had a problem here!
133
134	0	BigStr* result = NewStr(1);
135	95	result->data_[0] = data_[i];
136	95	return result;
137	95	}
138
139		// s[begin:]
140	6	BigStr* BigStr::slice(int begin) {
141	6	return slice(begin, len(this));
142	6	}
143
144		// s[begin:end]
145	636	BigStr* BigStr::slice(int begin, int end) {
146	636	int length = len(this);
147	636	SLICE_ADJUST(begin, end, length);
148
149	636	DCHECK(0 <= begin && begin <= length);
150	636	DCHECK(0 <= end && end <= length);
151
152	0	int new_len = end - begin;
153	636	DCHECK(0 <= new_len && new_len <= length);
154
155	0	BigStr* result = NewStr(new_len); // has kEmptyString optimization
156	636	memcpy(result->data_, data_ + begin, new_len);
157
158	636	return result;
159	636	}
160
161		// Used by 'help' builtin and --help, neither of which translate yet.
162
163	0	List<BigStr*>* BigStr::splitlines(bool keep) {
164	0	DCHECK(keep == true);
165	0	FAIL(kNotImplemented);
166	0	}
167
168	9	BigStr* BigStr::upper() {
169	9	int length = len(this);
170	9	BigStr* result = NewStr(length);
171	9	char* buffer = result->data();
172	56	for (int char_index = 0; char_index < length; ++char_index) {
173	47	buffer[char_index] = toupper(data_[char_index]);
174	47	}
175	9	return result;
176	9	}
177
178	6	BigStr* BigStr::lower() {
179	6	int length = len(this);
180	6	BigStr* result = NewStr(length);
181	6	char* buffer = result->data();
182	38	for (int char_index = 0; char_index < length; ++char_index) {
183	32	buffer[char_index] = tolower(data_[char_index]);
184	32	}
185	6	return result;
186	6	}
187
188	30	BigStr* BigStr::ljust(int width, BigStr* fillchar) {
189	30	DCHECK(len(fillchar) == 1);
190
191	0	int length = len(this);
192	30	int num_fill = width - length;
193	30	if (num_fill < 0) {
194	10	return this;
195	20	} else {
196	20	BigStr* result = NewStr(width);
197	20	char c = fillchar->data_[0];
198	20	memcpy(result->data_, data_, length);
199	42	for (int i = length; i < width; ++i) {
200	22	result->data_[i] = c;
201	22	}
202	20	return result;
203	20	}
204	30	}
205
206	30	BigStr* BigStr::rjust(int width, BigStr* fillchar) {
207	30	DCHECK(len(fillchar) == 1);
208
209	0	int length = len(this);
210	30	int num_fill = width - length;
211	30	if (num_fill < 0) {
212	10	return this;
213	20	} else {
214	20	BigStr* result = NewStr(width);
215	20	char c = fillchar->data_[0];
216	42	for (int i = 0; i < num_fill; ++i) {
217	22	result->data_[i] = c;
218	22	}
219	20	memcpy(result->data_ + num_fill, data_, length);
220	20	return result;
221	20	}
222	30	}
223
224	729	BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
225		// Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
226	729	return replace(old, new_str, -1);
227	729	}
228
229	729	BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
230		// log("replacing %s with %s", old_data, new_str->data_);
231	729	const char* old_data = old->data_;
232
233	729	int this_len = len(this);
234	729	int old_len = len(old);
235
236	729	const char* last_possible = data_ + this_len - old_len;
237
238	729	const char* p_this = data_; // advances through 'this'
239
240		// First pass: Calculate number of replacements, and hence new length
241	729	int replace_count = 0;
242	729	if (old_len == 0) {
243	0	replace_count = this_len + 1;
244	0	if (count > 0) {
245	0	replace_count = min(replace_count, count);
246	0	}
247	729	} else {
248	93.3k	while (p_this <= last_possible) {
249	92.6k	if (replace_count != count && // limit replacements (if count != -1)
250	92.6k	memcmp(p_this, old_data, old_len) == 0) { // equal
251	758	replace_count++;
252	758	p_this += old_len;
253	91.9k	} else {
254	91.9k	p_this++;
255	91.9k	}
256	92.6k	}
257	729	}
258
259		// log("replacements %d", replace_count);
260
261	729	if (replace_count == 0) {
262	4	return this; // Reuse the string if there were no replacements
263	4	}
264
265	725	int new_str_len = len(new_str);
266	725	int result_len =
267	725	this_len - (replace_count * old_len) + (replace_count * new_str_len);
268
269	725	BigStr* result = NewStr(result_len);
270
271	725	const char* new_data = new_str->data_;
272	725	const size_t new_len = new_str_len;
273
274		// Second pass: Copy pieces into 'result'
275	725	p_this = data_; // back to beginning
276	725	char* p_result = result->data_; // advances through 'result'
277	725	replace_count = 0;
278
279	725	if (old_len == 0) {
280		// Should place new_str between each char in this
281	0	while (p_this < last_possible && replace_count != count) {
282	0	replace_count++;
283	0	memcpy(p_result, new_data, new_len); // Copy from new_str
284	0	p_result += new_len; // Move past new_str
285
286		// Write a char from this
287	0	*p_result = *p_this;
288	0	p_this++;
289	0	p_result++;
290	0	}
291
292	0	if (replace_count != count) {
293		// Write a copy of new_str at the end
294	0	assert(p_this == last_possible);
295	0	memcpy(p_result, new_data, new_len);
296	0	} else if (p_this <= last_possible) {
297		// Write the last part of string
298	0	memcpy(p_result, p_this, data_ + this_len - p_this);
299	0	}
300	725	} else {
301	93.3k	while (p_this <= last_possible) {
302		// Note: would be more efficient if we remembered the match positions
303	92.6k	if (replace_count != count && // limit replacements (if count != -1)
304	92.6k	memcmp(p_this, old_data, old_len) == 0) { // equal
305	758	memcpy(p_result, new_data, new_len); // Copy from new_str
306	758	replace_count++;
307	758	p_result += new_len;
308	758	p_this += old_len;
309	91.8k	} else { // copy 1 byte
310	91.8k	*p_result = *p_this;
311	91.8k	p_result++;
312	91.8k	p_this++;
313	91.8k	}
314	92.6k	}
315	725	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
316	725	}
317
318	0	return result;
319	729	}
320
321		enum class StripWhere {
322		Left,
323		Right,
324		Both,
325		};
326
327		const int kWhitespace = -1;
328
329	166	bool OmitChar(int ch, int what) {
330	166	if (what == kWhitespace) {
331		// Intentional incompatibility with Python, where say \v is whitespace
332		// '\v'.strip() == ''
333		//
334		// But it is consistent with the JSON spec [ \t\r\n] and the rules in
335		// frontend/lexer_def.py
336		//
337		// Note that the YSH is separate, and Str => trim() respects Unicode.
338	126	return IsAsciiWhitespace(ch);
339	126	} else {
340	40	return what == ch;
341	40	}
342	166	}
343
344		// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
345		// implement 6 functions:
346		//
347		// strip / lstrip / rstrip
348		// strip(char) / lstrip(char) / rstrip(char)
349		//
350		// Args:
351		// where: which ends to strip from
352		// what: kWhitespace, or an ASCII code 0-255
353
354	64	BigStr* StripAny(BigStr* s, StripWhere where, int what) {
355	64	int length = len(s);
356	64	const char* char_data = s->data();
357
358	64	int i = 0;
359	64	if (where != StripWhere::Right) {
360	94	while (i < length && OmitChar(char_data[i], what)) {
361	54	i++;
362	54	}
363	40	}
364
365	64	int j = length;
366	64	if (where != StripWhere::Left) {
367	98	do {
368	98	j--;
369	98	} while (j >= i && OmitChar(char_data[j], what));
370	46	j++;
371	46	}
372
373	64	if (i == j) { // Optimization to reuse existing object
374	18	return kEmptyString;
375	18	}
376
377	46	if (i == 0 && j == length) { // nothing stripped
378	10	return s;
379	10	}
380
381		// Note: makes a copy in leaky version, and will in GC version too
382	36	int new_len = j - i;
383	36	BigStr* result = NewStr(new_len);
384	36	memcpy(result->data(), s->data() + i, new_len);
385	36	return result;
386	46	}
387
388	22	BigStr* BigStr::strip() {
389	22	return StripAny(this, StripWhere::Both, kWhitespace);
390	22	}
391
392		// Used for CommandSub in osh/cmd_exec.py
393	8	BigStr* BigStr::rstrip(BigStr* chars) {
394	8	DCHECK(len(chars) == 1);
395	0	int c = chars->data_[0];
396	8	return StripAny(this, StripWhere::Right, c);
397	8	}
398
399	16	BigStr* BigStr::rstrip() {
400	16	return StripAny(this, StripWhere::Right, kWhitespace);
401	16	}
402
403	8	BigStr* BigStr::lstrip(BigStr* chars) {
404	8	DCHECK(len(chars) == 1);
405	0	int c = chars->data_[0];
406	8	return StripAny(this, StripWhere::Left, c);
407	8	}
408
409	10	BigStr* BigStr::lstrip() {
410	10	return StripAny(this, StripWhere::Left, kWhitespace);
411	10	}
412
413	24	BigStr* BigStr::join(List<BigStr*>* items) {
414	24	int length = 0;
415
416	24	int num_parts = len(items);
417
418		// " ".join([]) == ""
419	24	if (num_parts == 0) {
420	9	return kEmptyString;
421	9	}
422
423		// Common case
424		// 'anything'.join(["foo"]) == "foo"
425	15	if (num_parts == 1) {
426	4	return items->at(0);
427	4	}
428
429	317	for (int i = 0; i < num_parts; ++i) {
430	306	length += len(items->at(i));
431	306	}
432		// add length of all the separators
433	11	int this_len = len(this);
434	11	length += this_len * (num_parts - 1);
435
436	11	BigStr* result = NewStr(length);
437	11	char* p_result = result->data_; // advances through
438
439	317	for (int i = 0; i < num_parts; ++i) {
440		// log("i %d", i);
441	306	if (i != 0 && this_len) { // optimize common case of ''.join()
442	16	memcpy(p_result, data_, this_len); // copy the separator
443	16	p_result += this_len;
444		// log("this_len %d", this_len);
445	16	}
446
447	306	int n = len(items->at(i));
448		// log("n: %d", n);
449	306	memcpy(p_result, items->at(i)->data_, n); // copy the list item
450	306	p_result += n;
451	306	}
452
453	11	return result;
454	15	}
455
456	98	static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
457	98	int new_len = right - left;
458	98	BigStr* part;
459	98	if (new_len == 0) {
460	42	part = kEmptyString;
461	56	} else {
462	56	part = NewStr(new_len);
463	56	memcpy(part->data_, s->data_ + left, new_len);
464	56	}
465	98	result->append(part);
466	98	}
467
468		// Split BigStr into List<BigStr*> of parts separated by 'sep'.
469		// The code structure is taken from CPython's Objects/stringlib/split.h.
470	38	List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
471	38	DCHECK(sep != nullptr);
472	38	DCHECK(len(sep) == 1); // we can only split one char
473	0	char sep_char = sep->data_[0];
474
475	38	int str_len = len(this);
476	38	if (str_len == 0) {
477		// weird case consistent with Python: ''.split(':') == ['']
478	4	return NewList<BigStr*>({kEmptyString});
479	4	}
480
481	34	List<BigStr*>* result = NewList<BigStr*>({});
482	34	int left = 0;
483	34	int right = 0;
484	34	int num_parts = 0; // 3 splits results in 4 parts
485
486	114	while (right < str_len && num_parts < max_split) {
487		// search for separator
488	186	for (; right < str_len; right++) {
489	174	if (data_[right] == sep_char) {
490	68	AppendPart(result, this, left, right);
491	68	right++;
492	68	left = right;
493	68	num_parts++;
494	68	break;
495	68	}
496	174	}
497	80	}
498	34	if (num_parts == 0) { // Optimization when there is no split
499	4	result->append(this);
500	30	} else if (left <= str_len) { // Last part
501	30	AppendPart(result, this, left, str_len);
502	30	}
503
504	34	return result;
505	38	}
506
507	32	List<BigStr*>* BigStr::split(BigStr* sep) {
508	32	return this->split(sep, len(this));
509	32	}
510
511	940	unsigned BigStr::hash(HashFunc h) {
512	940	if (!is_hashed_) {
513	305	hash_ = h(data_, len(this)) >> 1;
514	305	is_hashed_ = 1;
515	305	}
516	940	return hash_;
517	940	}
518
519	599	static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
520	599	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
521	599	auto end = std::cregex_iterator();
522
523	599	char int_buf[kMaxFmtWidth];
524	599	std::string buf;
525	1.60k	for (std::cregex_iterator it = beg; it != end; ++it) {
526	1.60k	const std::cmatch& match = *it;
527
528	1.60k	const std::csub_match& lit_m = match[1];
529	1.60k	DCHECK(lit_m.matched);
530	0	const std::string& lit_s = lit_m.str();
531	1.60k	buf.append(lit_s);
532
533	1.60k	int width = 0;
534	1.60k	bool zero_pad = false;
535	1.60k	bool pad_back = false;
536	1.60k	const std::csub_match& width_m = match[2];
537	1.60k	const std::string& width_s = width_m.str();
538	1.60k	bool ok = false;
539	1.60k	if (width_m.matched && !width_s.empty()) {
540	23	if (width_s[0] == '0') {
541	5	zero_pad = true;
542	5	DCHECK(width_s.size() > 1);
543	0	ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
544	5	DCHECK(ok);
545	0	(void)ok; // silence unused var warning in opt
546	18	} else {
547	18	ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
548	18	DCHECK(ok);
549	18	}
550	23	if (width < 0) {
551	2	pad_back = true;
552	2	width *= -1;
553	2	}
554	23	DCHECK(0 <= width && width < kMaxFmtWidth);
555	23	}
556
557	0	char const* str_to_add = nullptr;
558	1.60k	int add_len = 0;
559	1.60k	const std::csub_match& code_m = match[3];
560	1.60k	const std::string& code_s = code_m.str();
561	1.60k	if (!code_m.matched) {
562	599	DCHECK(!width_m.matched); // python errors on invalid format operators
563	0	break;
564	599	}
565	1.00k	DCHECK(code_s.size() == 1);
566	0	switch (code_s[0]) {
567	14	case '%': {
568	14	str_to_add = code_s.c_str();
569	14	add_len = 1;
570	14	break;
571	0	}
572	499	case 's': {
573	499	BigStr* s = va_arg(args, BigStr*);
574		// Check type unconditionally because mycpp doesn't always check it
575	499	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
576
577	0	str_to_add = s->data();
578	499	add_len = len(s);
579	499	zero_pad = false; // python ignores the 0 directive for strings
580	499	break;
581	0	}
582	32	case 'r': {
583	32	BigStr* s = va_arg(args, BigStr*);
584		// Check type unconditionally because mycpp doesn't always check it
585	32	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
586
587	0	s = repr(s);
588	32	str_to_add = s->data();
589	32	add_len = len(s);
590	32	zero_pad = false; // python ignores the 0 directive for strings
591	32	break;
592	0	}
593	454	case 'd': // fallthrough
594	463	case 'o': {
595	463	int d = va_arg(args, int);
596	463	add_len = snprintf(int_buf, kMaxFmtWidth,
597	463	match.str().c_str() + lit_s.size(), d);
598	463	DCHECK(add_len > 0);
599	0	str_to_add = int_buf;
600	463	break;
601	454	}
602	0	default:
603	0	DCHECK(0);
604	0	break;
605	1.00k	}
606	1.00k	DCHECK(str_to_add != nullptr);
607
608	1.00k	if (pad_back) {
609	2	buf.append(str_to_add, add_len);
610	2	}
611	1.00k	if (add_len < width) {
612	42	for (int i = 0; i < width - add_len; ++i) {
613	36	buf.push_back(zero_pad ? '0' : ' ');
614	36	}
615	6	}
616	1.00k	if (!pad_back) {
617	1.00k	buf.append(str_to_add, add_len);
618	1.00k	}
619	1.00k	}
620
621	599	return StrFromC(buf.c_str(), buf.size());
622	599	}
623
624	43	BigStr* StrIter::Value() { // similar to at()
625	43	BigStr* result = NewStr(1);
626	43	result->data_[0] = s_->data_[i_];
627	43	DCHECK(result->data_[1] == '\0');
628	0	return result;
629	43	}
630
631	592	BigStr* StrFormat(const char* fmt, ...) {
632	592	va_list args;
633	592	va_start(args, fmt);
634	592	BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
635	592	va_end(args);
636	592	return ret;
637	592	}
638
639	7	BigStr* StrFormat(BigStr* fmt, ...) {
640	7	va_list args;
641	7	va_start(args, fmt);
642	7	BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
643	7	va_end(args);
644	7	return ret;
645	7	}