Repository URL to install this package:
|
Version:
1.4.3 ▾
|
/*
Copyright (c) 2012, Lambda Foundry, Inc., except where noted
Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
BSD
See LICENSE for the license
*/
/*
Low-level ascii-file processing for pandas. Combines some elements from
Python's built-in csv module and Warren Weckesser's textreader project on
GitHub. See Python Software Foundation License and BSD licenses for these.
*/
#include "tokenizer.h"
#include <ctype.h>
#include <float.h>
#include <math.h>
#include "../headers/portable.h"
/*
 * Aim a column iterator at column `i` (0-based) of `parser`.
 *
 * The iterator borrows the parser's word table and a cursor into its
 * line_start array, offset by `start` entries; nothing is copied.
 */
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                   int64_t start) {
    self->col = i;
    self->line_start = &parser->line_start[start];
    self->words = parser->words;
}
static void free_if_not_null(void **ptr) {
TRACE(("free_if_not_null %p\n", *ptr))
if (*ptr != NULL) {
free(*ptr);
*ptr = NULL;
}
}
/*
Parser / tokenizer
*/
/*
 * Double the capacity of `buffer` until `length + space` elements fit
 * strictly below it.
 *
 * buffer   : current allocation (a NULL buffer is reported as a failure)
 * length   : number of elements currently in use
 * capacity : in/out element capacity; only written when growth succeeds
 * space    : number of additional elements the caller wants room for
 * elsize   : size of one element in bytes
 * error    : out; 0 on success, errno when the buffer is NULL or a
 *            realloc failed
 *
 * Returns the (possibly moved) buffer. On failure the most recent valid
 * allocation is returned so the caller can still free it.
 */
static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity,
                         int64_t space, int64_t elsize, int *error) {
    uint64_t new_cap = *capacity;
    void *grown = buffer;
    const uint64_t needed = length + space;

    while (grown != NULL && needed >= new_cap) {
        new_cap = (new_cap == 0) ? 2 : new_cap << 1;
        buffer = grown;  // remember the last allocation known to be valid
        grown = realloc(grown, elsize * new_cap);
    }

    if (grown != NULL) {
        // every realloc succeeded (or none was needed)
        *capacity = new_cap;
        *error = 0;
        return grown;
    }

    // leave *capacity untouched and hand back the last good buffer
    *error = errno;
    return buffer;
}
/*
 * Load the default tokenizer configuration into `self` and put the state
 * machine back at START_RECORD.
 */
void parser_set_default_options(parser_t *self) {
    // For tokenization
    self->state = START_RECORD;

    // number formatting characters
    self->decimal = '.';
    self->sci = 'E';
    self->thousands = '\0';

    // field and record separation
    self->delimiter = ',';  // XXX
    self->delim_whitespace = 0;
    self->skipinitialspace = 0;
    self->lineterminator = '\0'; /* NUL->standard logic */
    self->allow_embedded_newline = 1;

    // quoting, escaping and comments
    self->quoting = QUOTE_MINIMAL;
    self->quotechar = '"';
    self->doublequote = 0;
    self->escapechar = 0;
    self->commentchar = '#';

    // row shape and bad-line policy
    self->expected_fields = -1;
    self->on_bad_lines = ERROR;

    // row skipping
    self->skipset = NULL;
    self->skipfunc = NULL;
    self->skip_first_N_rows = -1;
    self->skip_footer = 0;
}
// Allocate a zero-filled parser_t. Returns NULL when calloc fails.
parser_t *parser_new() {
    parser_t *self = calloc(1, sizeof(*self));
    return self;
}
/*
 * Free the token stream, word vectors and line vectors owned by `self`,
 * leaving each pointer NULL. Always returns 0.
 */
int parser_clear_data_buffers(parser_t *self) {
    void **owned[] = {
        (void *)&self->stream,
        (void *)&self->words,
        (void *)&self->word_starts,
        (void *)&self->line_start,
        (void *)&self->line_fields,
    };
    const size_t count = sizeof(owned) / sizeof(owned[0]);
    size_t idx;

    for (idx = 0; idx < count; ++idx) {
        free_if_not_null(owned[idx]);
    }
    return 0;
}
/*
 * Release everything the parser owns: pending error/warning messages, the
 * skip-row set, the data buffers, and finally the IO source through the
 * cb_cleanup callback (which is cleared after being invoked).
 *
 * Returns 0 on success, -1 if clearing the buffers or the callback
 * reported a failure.
 */
int parser_cleanup(parser_t *self) {
    int failed = 0;

    // XXX where to put this
    free_if_not_null((void *)&self->error_msg);
    free_if_not_null((void *)&self->warn_msg);

    if (self->skipset != NULL) {
        kh_destroy_int64((kh_int64_t *)self->skipset);
        self->skipset = NULL;
    }

    failed |= (parser_clear_data_buffers(self) < 0);

    if (self->cb_cleanup != NULL) {
        failed |= (self->cb_cleanup(self->source) < 0);
        self->cb_cleanup = NULL;
    }

    return failed ? -1 : 0;
}
/*
 * Allocate the parser's initial buffers and reset its bookkeeping.
 *
 * Returns 0 on success. If any allocation fails, parser_cleanup() is run
 * and PARSER_OUT_OF_MEMORY is returned.
 */
int parser_init(parser_t *self) {
    int64_t n_slots;

    /*
      Start from a known-empty state so parser_cleanup() is safe on any of
      the early exits below.
    */
    self->error_msg = NULL;
    self->warn_msg = NULL;
    self->line_fields = NULL;
    self->line_start = NULL;
    self->word_starts = NULL;
    self->words = NULL;

    // token stream
    self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
    if (self->stream == NULL) {
        parser_cleanup(self);
        return PARSER_OUT_OF_MEMORY;
    }
    self->stream_cap = STREAM_INIT_SIZE;
    self->stream_len = 0;

    // word and line vectors start at a tenth of the stream size (minimum 1)
    n_slots = STREAM_INIT_SIZE / 10;
    if (n_slots == 0) {
        n_slots = 1;
    }

    // word pointers and metadata
    self->words = malloc(n_slots * sizeof(char *));
    self->word_starts = malloc(n_slots * sizeof(int64_t));
    self->max_words_cap = n_slots;
    self->words_cap = n_slots;
    self->words_len = 0;

    // line pointers and metadata
    self->line_start = malloc(n_slots * sizeof(int64_t));
    self->line_fields = malloc(n_slots * sizeof(int64_t));
    self->lines_cap = n_slots;
    self->lines = 0;
    self->file_lines = 0;

    // the stream was already verified above; check the four vectors
    if (self->words == NULL || self->word_starts == NULL ||
        self->line_start == NULL || self->line_fields == NULL) {
        parser_cleanup(self);
        return PARSER_OUT_OF_MEMORY;
    }

    /* amount of bytes buffered */
    self->datalen = 0;
    self->datapos = 0;

    // first line starts at word 0 and has no fields yet
    self->line_start[0] = 0;
    self->line_fields[0] = 0;

    // the current word begins at the start of the (empty) stream
    self->pword_start = self->stream;
    self->word_start = 0;

    self->state = START_RECORD;
    self->commentchar = '\0';

    return 0;
}
// Counterpart of parser_init: releases the parser's resources via
// parser_cleanup (its status is discarded) but not the parser_t itself.
void parser_free(parser_t *self) {
    (void)parser_cleanup(self);
}
// Release the parser_t allocation itself; nothing it points to is freed
// here.
void parser_del(parser_t *self) {
    free((void *)self);
}
/*
 * Make sure the token stream, the word vectors and the line vectors can
 * take `nbytes` more input bytes, growing each one through grow_buffer /
 * realloc as needed.
 *
 * Returns 0 on success and PARSER_OUT_OF_MEMORY as soon as any growth
 * step fails.
 */
static int make_stream_space(parser_t *self, size_t nbytes) {
uint64_t i, cap, length;
int status;
void *orig_ptr, *newptr;
// Can we fit potentially nbytes tokens (+ null terminators) in the stream?
/*
TOKEN STREAM
*/
// remember the old address so we can tell whether the stream moved
orig_ptr = (void *)self->stream;
TRACE(
("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
nbytes))
// room is requested for nbytes * 2 characters
self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len,
&self->stream_cap, nbytes * 2,
sizeof(char), &status);
TRACE(
("make_stream_space: self->stream=%p, self->stream_len = %zu, "
"self->stream_cap=%zu, status=%zu\n",
self->stream, self->stream_len, self->stream_cap, status))
if (status != 0) {
return PARSER_OUT_OF_MEMORY;
}
// realloc sets errno when moving buffer?
// If the stream moved, every pointer into it is stale: rebuild them from
// the stored offsets (word_start / word_starts[]).
if (self->stream != orig_ptr) {
self->pword_start = self->stream + self->word_start;
for (i = 0; i < self->words_len; ++i) {
self->words[i] = self->stream + self->word_starts[i];
}
}
/*
WORD VECTORS
*/
// snapshot the capacity so a change made by grow_buffer can be detected
cap = self->words_cap;
/**
* If we are reading in chunks, we need to be aware of the maximum number
* of words we have seen in previous chunks (self->max_words_cap), so
* that way, we can properly allocate when reading subsequent ones.
*
* Otherwise, we risk a buffer overflow if we mistakenly under-allocate
* just because a recent chunk did not have as many words.
*/
if (self->words_len + nbytes < self->max_words_cap) {
length = self->max_words_cap - nbytes - 1;
} else {
length = self->words_len;
}
self->words =
(char **)grow_buffer((void *)self->words, length,
&self->words_cap, nbytes,
sizeof(char *), &status);
TRACE(
("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
"%d)\n",
self->words_len, self->words_cap, nbytes, status))
if (status != 0) {
return PARSER_OUT_OF_MEMORY;
}
// realloc took place
// word_starts shares words_cap with words, so resize it to match
if (cap != self->words_cap) {
TRACE(
("make_stream_space: cap != self->words_cap, nbytes = %d, "
"self->words_cap=%d\n",
nbytes, self->words_cap))
newptr = realloc((void *)self->word_starts,
sizeof(int64_t) * self->words_cap);
if (newptr == NULL) {
return PARSER_OUT_OF_MEMORY;
} else {
self->word_starts = (int64_t *)newptr;
}
}
/*
LINE VECTORS
*/
cap = self->lines_cap;
// lines + 1 entries are counted as in use
self->line_start =
(int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
&self->lines_cap, nbytes,
sizeof(int64_t), &status);
TRACE((
"make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
self->lines + 1, self->lines_cap, nbytes, status))
if (status != 0) {
return PARSER_OUT_OF_MEMORY;
}
// realloc took place
// line_fields shares lines_cap with line_start, so resize it to match
if (cap != self->lines_cap) {
TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n",
nbytes))
newptr = realloc((void *)self->line_fields,
sizeof(int64_t) * self->lines_cap);
if (newptr == NULL) {
return PARSER_OUT_OF_MEMORY;
} else {
self->line_fields = (int64_t *)newptr;
}
}
return 0;
}
/*
 * Append one character to the token stream.
 *
 * Returns 0 on success. If the stream is already at capacity nothing is
 * written, an error message is stored in self->error_msg (when it can be
 * allocated) and PARSER_OUT_OF_MEMORY is returned.
 */
static int push_char(parser_t *self, char c) {
    TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
           self->stream_len + 1, c, self->stream_cap))
    if (self->stream_len >= self->stream_cap) {
        TRACE(
            ("push_char: ERROR!!! self->stream_len(%d) >= "
             "self->stream_cap(%d)\n",
             self->stream_len, self->stream_cap))
        int64_t bufsize = 100;
        self->error_msg = malloc(bufsize);
        // malloc may fail too; never hand snprintf a NULL destination
        if (self->error_msg != NULL) {
            snprintf(self->error_msg, bufsize,
                     "Buffer overflow caught - possible malformed input file.\n");
        }
        return PARSER_OUT_OF_MEMORY;
    }
    self->stream[self->stream_len++] = c;
    return 0;
}
/*
 * Close the field currently being accumulated: NUL-terminate it in the
 * stream, record its pointer and offset in the word vectors, bump the
 * field count of the current line and start a new (empty) word at the end
 * of the stream.
 *
 * Returns 0 on success. Returns PARSER_OUT_OF_MEMORY when the word vector
 * is full, or the status of push_char when the terminator does not fit in
 * the stream; in both cases no word is recorded.
 */
int PANDAS_INLINE end_field(parser_t *self) {
    int status;

    // XXX cruft
    if (self->words_len >= self->words_cap) {
        TRACE(
            ("end_field: ERROR!!! self->words_len(%zu) >= "
             "self->words_cap(%zu)\n",
             self->words_len, self->words_cap))
        int64_t bufsize = 100;
        self->error_msg = malloc(bufsize);
        // malloc may fail too; never hand snprintf a NULL destination
        if (self->error_msg != NULL) {
            snprintf(self->error_msg, bufsize,
                     "Buffer overflow caught - possible malformed input file.\n");
        }
        return PARSER_OUT_OF_MEMORY;
    }

    // null terminate token; recording a word whose terminator was never
    // written would leave an unterminated string in self->words
    status = push_char(self, '\0');
    if (status != 0) {
        return status;
    }

    // set pointer and metadata
    self->words[self->words_len] = self->pword_start;

    TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0]));

    TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start,
           self->word_start, self->words_len + 1))

    self->word_starts[self->words_len] = self->word_start;
    self->words_len++;

    // increment line field count
    self->line_fields[self->lines]++;

    // New field begin in stream
    self->pword_start = self->stream + self->stream_len;
    self->word_start = self->stream_len;

    return 0;
}
/*
 * Append `msg` to the parser's accumulated warning text (self->warn_msg),
 * allocating the buffer on first use and growing it afterwards.
 *
 * This is best-effort: if memory cannot be obtained the new message is
 * dropped and any existing warning text is left untouched.
 */
static void append_warning(parser_t *self, const char *msg) {
    int64_t ex_length;
    int64_t length = strlen(msg);
    void *newptr;

    if (self->warn_msg == NULL) {
        self->warn_msg = malloc(length + 1);
        // malloc may fail; never hand snprintf a NULL destination
        if (self->warn_msg != NULL) {
            snprintf(self->warn_msg, length + 1, "%s", msg);
        }
    } else {
        ex_length = strlen(self->warn_msg);
        newptr = realloc(self->warn_msg, ex_length + length + 1);
        if (newptr != NULL) {
            self->warn_msg = (char *)newptr;
            // write the new text over the old terminator
            snprintf(self->warn_msg + ex_length, length + 1, "%s", msg);
        }
    }
}
/*
 * Finish the line currently being tokenized.
 *
 * The number of fields seen is compared with the expected count
 * (self->expected_fields when non-negative, otherwise the field count of
 * the previous line):
 *   - lines tokenized in one of the *_IN_SKIP_LINE states are discarded;
 *   - lines past the header with too many fields (and no usecols) are
 *     discarded and, depending on self->on_bad_lines, raise an error or
 *     append a warning;
 *   - otherwise missing trailing fields are padded with empty fields and
 *     the line is committed.
 *
 * Returns 0 on success, -1 on a bad line with on_bad_lines == ERROR or
 * when memory could not be obtained, the status of end_field if padding
 * fails, and PARSER_OUT_OF_MEMORY when the line vectors are full.
 */
static int end_line(parser_t *self) {
    char *msg;
    int64_t fields;
    int64_t ex_fields = self->expected_fields;
    const int64_t bufsize = 100;  // for error or warning messages

    fields = self->line_fields[self->lines];

    TRACE(("end_line: Line end, nfields: %d\n", fields));

    TRACE(("end_line: lines: %d\n", self->lines));
    if (self->lines > 0) {
        if (self->expected_fields >= 0) {
            ex_fields = self->expected_fields;
        } else {
            ex_fields = self->line_fields[self->lines - 1];
        }
    }
    TRACE(("end_line: ex_fields: %d\n", ex_fields));

    if (self->state == START_FIELD_IN_SKIP_LINE ||
        self->state == IN_FIELD_IN_SKIP_LINE ||
        self->state == IN_QUOTED_FIELD_IN_SKIP_LINE ||
        self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) {
        TRACE(("end_line: Skipping row %d\n", self->file_lines));
        // increment file line count
        self->file_lines++;

        // skip the tokens from this bad line
        self->line_start[self->lines] += fields;

        // reset field count
        self->line_fields[self->lines] = 0;
        return 0;
    }

    if (!(self->lines <= self->header_end + 1) &&
        (fields > ex_fields) && !(self->usecols)) {
        // increment file line count
        self->file_lines++;

        // skip the tokens from this bad line
        self->line_start[self->lines] += fields;

        // reset field count
        self->line_fields[self->lines] = 0;

        // file_lines is now the actual file line number (starting at 1)
        if (self->on_bad_lines == ERROR) {
            self->error_msg = malloc(bufsize);
            // malloc may fail; never hand snprintf a NULL destination
            if (self->error_msg != NULL) {
                snprintf(self->error_msg, bufsize,
                         "Expected %" PRId64 " fields in line %" PRIu64 ", saw %"
                         PRId64 "\n", ex_fields, self->file_lines, fields);
            }

            TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));

            return -1;
        } else {
            // simply skip bad lines
            if (self->on_bad_lines == WARN) {
                // pass up error message; the warning is best-effort, so it
                // is simply dropped when the scratch buffer is unavailable
                msg = malloc(bufsize);
                if (msg != NULL) {
                    snprintf(msg, bufsize,
                             "Skipping line %" PRIu64 ": expected %" PRId64
                             " fields, saw %" PRId64 "\n",
                             self->file_lines, ex_fields, fields);
                    append_warning(self, msg);
                    free(msg);
                }
            }
        }
    } else {
        // missing trailing delimiters
        if ((self->lines >= self->header_end + 1) &&
            fields < ex_fields) {
            // might overrun the buffer when closing fields
            if (make_stream_space(self, ex_fields - fields) < 0) {
                self->error_msg = malloc(bufsize);
                if (self->error_msg != NULL) {
                    snprintf(self->error_msg, bufsize, "out of memory");
                }
                return -1;
            }

            while (fields < ex_fields) {
                // stop padding as soon as a field cannot be recorded
                int status = end_field(self);
                if (status < 0) {
                    return status;
                }
                fields++;
            }
        }

        // increment both line counts
        self->file_lines++;
        self->lines++;

        // good line, set new start point
        if (self->lines >= self->lines_cap) {
            TRACE((
                "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
                self->lines, self->lines_cap))
            self->error_msg = malloc(bufsize);
            if (self->error_msg != NULL) {
                snprintf(self->error_msg, bufsize,
                         "Buffer overflow caught - "
                         "possible malformed input file.\n");
            }
            return PARSER_OUT_OF_MEMORY;
        }
        self->line_start[self->lines] =
            (self->line_start[self->lines - 1] + fields);

        TRACE(
            ("end_line: new line start: %d\n", self->line_start[self->lines]));

        // new line start with 0 fields
        self->line_fields[self->lines] = 0;
    }

    TRACE(("end_line: Finished line, at %d\n", self->lines));

    return 0;
}
/*
 * Register `row` in the parser's set of rows to skip, creating the set on
 * first use. Always returns 0.
 */
int parser_add_skiprow(parser_t *self, int64_t row) {
    int put_status = 0;
    kh_int64_t *rows;
    khiter_t slot;

    // lazily create the hash set
    if (self->skipset == NULL) {
        self->skipset = (void *)kh_init_int64();
    }
    rows = (kh_int64_t *)self->skipset;

    slot = kh_put_int64(rows, row, &put_status);
    rows->keys[slot] = row;

    return 0;
}
/*
 * Ask the parser to skip the first `nrows` rows. Non-positive values
 * leave the current setting unchanged. Always returns 0.
 */
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
    if (nrows <= 0) {
        return 0;
    }
    // self->file_lines is zero based so subtract 1 from nrows
    self->skip_first_N_rows = nrows - 1;
    return 0;
}
/*
 * Refill the parser's input window by asking the IO callback for up to
 * `nbytes` bytes.
 *
 * self->data / self->datalen are replaced with what the callback returned
 * and self->datapos is reset to 0.
 *
 * Returns the status reported by the callback (e.g. REACHED_EOF), or -1
 * with self->error_msg set (when it can be allocated) if the callback
 * returned no data without signalling EOF.
 */
static int parser_buffer_bytes(parser_t *self, size_t nbytes,
                               const char *encoding_errors) {
    int status;
    size_t bytes_read;

    status = 0;
    self->datapos = 0;
    self->data = self->cb_io(self->source, nbytes, &bytes_read, &status,
                             encoding_errors);
    TRACE((
        "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
        nbytes, bytes_read, status));
    self->datalen = bytes_read;

    if (status != REACHED_EOF && self->data == NULL) {
        int64_t bufsize = 200;
        self->error_msg = malloc(bufsize);

        // malloc may fail; never hand snprintf a NULL destination
        if (self->error_msg != NULL) {
            if (status == CALLING_READ_FAILED) {
                snprintf(self->error_msg, bufsize,
                         "Calling read(nbytes) on source failed. "
                         "Try engine='python'.");
            } else {
                snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
            }
        }
        return -1;
    }

    TRACE(("datalen: %d\n", self->datalen));

    return status;
}
/*
Tokenization macros and state machine code
*/
// The macros below are meant to be expanded inside tokenize_bytes: they
// rely on its locals (self, stream, slen, i, buf, line_limit, start_lines
// and the *_symbol / line_terminator constants) and on its parsingerror /
// linelimit labels.

// Append c at the local stream cursor. On overflow, stores an error
// message and returns PARSER_OUT_OF_MEMORY from the enclosing function.
#define PUSH_CHAR(c) \
TRACE( \
("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
c, slen, self->stream_cap, self->stream_len)) \
if (slen >= self->stream_cap) { \
TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
self->stream_cap)) \
int64_t bufsize = 100; \
self->error_msg = malloc(bufsize); \
snprintf(self->error_msg, bufsize, \
"Buffer overflow caught - possible malformed input file.\n");\
return PARSER_OUT_OF_MEMORY; \
} \
*stream++ = c; \
slen++;
// This is a little bit of a hack but works for now
// Publish the local length to self, close the field with end_field(), then
// reload the local cursor because end_field() appended a terminator.
#define END_FIELD() \
self->stream_len = slen; \
if (end_field(self) < 0) { \
goto parsingerror; \
} \
stream = self->stream + self->stream_len; \
slen = self->stream_len;
// Close the current line with end_line(), move to STATE, and jump to
// linelimit once line_limit lines have been produced since start_lines.
#define END_LINE_STATE(STATE) \
self->stream_len = slen; \
if (end_line(self) < 0) { \
goto parsingerror; \
} \
stream = self->stream + self->stream_len; \
slen = self->stream_len; \
self->state = STATE; \
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
goto linelimit; \
}
// Same as END_LINE_STATE, but end_field() is also called right after
// end_line(), so the next line starts with one already-closed field.
#define END_LINE_AND_FIELD_STATE(STATE) \
self->stream_len = slen; \
if (end_line(self) < 0) { \
goto parsingerror; \
} \
if (end_field(self) < 0) { \
goto parsingerror; \
} \
stream = self->stream + self->stream_len; \
slen = self->stream_len; \
self->state = STATE; \
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
goto linelimit; \
}
#define END_LINE() END_LINE_STATE(START_RECORD)
// Character classification helpers.
#define IS_TERMINATOR(c) \
(c == line_terminator)
// A quote character only counts when quoting is enabled
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
// don't parse '\r' with a custom line terminator
#define IS_CARRIAGE(c) (c == carriage_symbol)
#define IS_COMMENT_CHAR(c) (c == comment_symbol)
#define IS_ESCAPE_CHAR(c) (c == escape_symbol)
// A space is skippable only with skipinitialspace set and when whitespace
// is not itself the delimiter
#define IS_SKIPPABLE_SPACE(c) \
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
// applied when in a field
#define IS_DELIMITER(c) \
((!self->delim_whitespace && c == self->delimiter) || \
(self->delim_whitespace && isblank(c)))
// Write the local stream length and input position back to the parser
#define _TOKEN_CLEANUP() \
self->stream_len = slen; \
self->datapos = i; \
TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \
self->datalen));
// Skip a UTF-8 byte order mark (EF BB BF) at the current input position.
// NOTE(review): reads buf[1] and buf[2] without a bounds check; confirm
// the input buffer always has at least three readable bytes here.
#define CHECK_FOR_BOM() \
if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
buf += 3; \
self->datapos += 3; \
}
/*
 * Decide whether row `rownum` should be skipped.
 *
 * Precedence: a Python callable (self->skipfunc) if set, else the explicit
 * skip set (self->skipset, looked up by self->file_lines), else the
 * "skip first N rows" threshold.
 *
 * Returns non-zero to skip, 0 to keep, and -1 if calling the Python
 * callable (or evaluating its truth value) failed; in that case a Python
 * exception is expected to be pending for the caller to handle.
 */
int skip_this_line(parser_t *self, int64_t rownum) {
    if (self->skipfunc != NULL) {
        int should_skip;
        PyObject *result;
        PyGILState_STATE state = PyGILState_Ensure();

        // "L" consumes a long long. The previous "i" unit reads a C int
        // from the varargs, which does not match an int64_t argument.
        result = PyObject_CallFunction(self->skipfunc, "L",
                                       (long long)rownum);

        // Error occurred. It will be processed
        // and caught at the Cython level.
        if (result == NULL) {
            should_skip = -1;
        } else {
            should_skip = PyObject_IsTrue(result);
        }

        Py_XDECREF(result);
        PyGILState_Release(state);

        return should_skip;
    } else if (self->skipset != NULL) {
        // kh_get returns n_buckets when the key is absent
        return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
                ((kh_int64_t *)self->skipset)->n_buckets);
    } else {
        return (rownum <= self->skip_first_N_rows);
    }
}
/*
 * Run the tokenizer state machine over the buffered input
 * self->data[self->datapos .. self->datalen).
 *
 * Characters are copied into the token stream and fields / lines are
 * closed through the END_FIELD / END_LINE macros. The state is kept in
 * self->state, so a later call continues where this one stopped.
 *
 * line_limit  : when > 0, stop as soon as self->lines reaches
 *               start_lines + line_limit
 * start_lines : value of self->lines when the enclosing request began
 *
 * Returns 0 when the buffer was consumed or the line limit was reached,
 * -1 on error. PUSH_CHAR may also return PARSER_OUT_OF_MEMORY directly.
 */
int tokenize_bytes(parser_t *self,
size_t line_limit, uint64_t start_lines) {
int64_t i;
uint64_t slen;
int should_skip;
char c;
char *stream;
char *buf = self->data + self->datapos;
// a NUL lineterminator selects the standard '\n' / '\r' handling
const char line_terminator = (self->lineterminator == '\0') ?
'\n' : self->lineterminator;
// 1000 is something that couldn't fit in "char"
// thus comparing a char to it would always be "false"
const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
const int comment_symbol = (self->commentchar != '\0') ?
self->commentchar : 1000;
const int escape_symbol = (self->escapechar != '\0') ?
self->escapechar : 1000;
// reserve room for everything that is left in the input buffer
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
int64_t bufsize = 100;
self->error_msg = malloc(bufsize);
snprintf(self->error_msg, bufsize, "out of memory");
return -1;
}
// local write cursor / length; written back by _TOKEN_CLEANUP and the
// END_* macros
stream = self->stream + self->stream_len;
slen = self->stream_len;
TRACE(("%s\n", buf));
// only look for a byte order mark before any line has been read
if (self->file_lines == 0) {
CHECK_FOR_BOM();
}
for (i = self->datapos; i < self->datalen; ++i) {
// next character in file
c = *buf++;
TRACE(
("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
"state %d\n",
i, c, self->file_lines + 1, self->line_fields[self->lines],
self->state));
switch (self->state) {
// The *_IN_SKIP_LINE states walk over a skipped row without storing
// any characters.
case START_FIELD_IN_SKIP_LINE:
if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_QUOTE(c)) {
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else if (IS_DELIMITER(c)) {
// Do nothing, we're starting a new field again.
} else {
self->state = IN_FIELD_IN_SKIP_LINE;
}
break;
case IN_FIELD_IN_SKIP_LINE:
if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_DELIMITER(c)) {
self->state = START_FIELD_IN_SKIP_LINE;
}
break;
case IN_QUOTED_FIELD_IN_SKIP_LINE:
if (IS_QUOTE(c)) {
if (self->doublequote) {
self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
} else {
self->state = IN_FIELD_IN_SKIP_LINE;
}
}
break;
case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
if (IS_QUOTE(c)) {
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_DELIMITER(c)) {
self->state = START_FIELD_IN_SKIP_LINE;
} else {
self->state = IN_FIELD_IN_SKIP_LINE;
}
break;
// A line that has only contained blanks so far.
case WHITESPACE_LINE:
if (IS_TERMINATOR(c)) {
self->file_lines++;
self->state = START_RECORD;
break;
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
break;
} else if (!self->delim_whitespace) {
if (isblank(c) && c != self->delimiter) {
} else { // backtrack
// use i + 1 because buf has been incremented but not i
do {
--buf;
--i;
} while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));
// reached a newline rather than the beginning
if (IS_TERMINATOR(*buf)) {
++buf; // move pointer to first char after newline
++i;
}
self->state = START_FIELD;
}
break;
}
// fall through
case EAT_WHITESPACE:
if (IS_TERMINATOR(c)) {
END_LINE();
self->state = START_RECORD;
break;
} else if (IS_CARRIAGE(c)) {
self->state = EAT_CRNL;
break;
} else if (IS_COMMENT_CHAR(c)) {
self->state = EAT_COMMENT;
break;
} else if (!isblank(c)) {
self->state = START_FIELD;
// fall through to subsequent state
} else {
// if whitespace char, keep slurping
break;
}
case START_RECORD:
// start of record
should_skip = skip_this_line(self, self->file_lines);
// -1 signals a failure inside skip_this_line
if (should_skip == -1) {
goto parsingerror;
} else if (should_skip) {
if (IS_QUOTE(c)) {
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else {
self->state = IN_FIELD_IN_SKIP_LINE;
if (IS_TERMINATOR(c)) {
END_LINE();
}
}
break;
} else if (IS_TERMINATOR(c)) {
// \n\r possible?
if (self->skip_empty_lines) {
self->file_lines++;
} else {
END_LINE();
}
break;
} else if (IS_CARRIAGE(c)) {
if (self->skip_empty_lines) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else {
self->state = EAT_CRNL;
}
break;
} else if (IS_COMMENT_CHAR(c)) {
self->state = EAT_LINE_COMMENT;
break;
} else if (isblank(c)) {
if (self->delim_whitespace) {
if (self->skip_empty_lines) {
self->state = WHITESPACE_LINE;
} else {
self->state = EAT_WHITESPACE;
}
break;
} else if (c != self->delimiter && self->skip_empty_lines) {
self->state = WHITESPACE_LINE;
break;
}
// fall through
}
// normal character - fall through
// to handle as START_FIELD
self->state = START_FIELD;
case START_FIELD:
// expecting field
if (IS_TERMINATOR(c)) {
END_FIELD();
END_LINE();
} else if (IS_CARRIAGE(c)) {
END_FIELD();
self->state = EAT_CRNL;
} else if (IS_QUOTE(c)) {
// start quoted field
self->state = IN_QUOTED_FIELD;
} else if (IS_ESCAPE_CHAR(c)) {
// possible escaped character
self->state = ESCAPED_CHAR;
} else if (IS_SKIPPABLE_SPACE(c)) {
// ignore space at start of field
} else if (IS_DELIMITER(c)) {
if (self->delim_whitespace) {
self->state = EAT_WHITESPACE;
} else {
// save empty field
END_FIELD();
}
} else if (IS_COMMENT_CHAR(c)) {
END_FIELD();
self->state = EAT_COMMENT;
} else {
// begin new unquoted field
PUSH_CHAR(c);
self->state = IN_FIELD;
}
break;
// The character after an escape is stored as-is.
case ESCAPED_CHAR:
PUSH_CHAR(c);
self->state = IN_FIELD;
break;
// A comment that began at the start of a record: nothing is stored and
// end_line is not called, only file_lines advances.
case EAT_LINE_COMMENT:
if (IS_TERMINATOR(c)) {
self->file_lines++;
self->state = START_RECORD;
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
}
break;
case IN_FIELD:
// in unquoted field
if (IS_TERMINATOR(c)) {
END_FIELD();
END_LINE();
} else if (IS_CARRIAGE(c)) {
END_FIELD();
self->state = EAT_CRNL;
} else if (IS_ESCAPE_CHAR(c)) {
// possible escaped character
self->state = ESCAPED_CHAR;
} else if (IS_DELIMITER(c)) {
// end of field - end of line not reached yet
END_FIELD();
if (self->delim_whitespace) {
self->state = EAT_WHITESPACE;
} else {
self->state = START_FIELD;
}
} else if (IS_COMMENT_CHAR(c)) {
END_FIELD();
self->state = EAT_COMMENT;
} else {
// normal character - save in field
PUSH_CHAR(c);
}
break;
case IN_QUOTED_FIELD:
// in quoted field
if (IS_ESCAPE_CHAR(c)) {
// possible escape character
self->state = ESCAPE_IN_QUOTED_FIELD;
} else if (IS_QUOTE(c)) {
if (self->doublequote) {
// double quote - " represented by ""
self->state = QUOTE_IN_QUOTED_FIELD;
} else {
// end of quote part of field
self->state = IN_FIELD;
}
} else {
// normal character - save in field
PUSH_CHAR(c);
}
break;
case ESCAPE_IN_QUOTED_FIELD:
PUSH_CHAR(c);
self->state = IN_QUOTED_FIELD;
break;
case QUOTE_IN_QUOTED_FIELD:
// double quote - seen a quote in an quoted field
if (IS_QUOTE(c)) {
// save "" as "
PUSH_CHAR(c);
self->state = IN_QUOTED_FIELD;
} else if (IS_DELIMITER(c)) {
// end of field - end of line not reached yet
END_FIELD();
if (self->delim_whitespace) {
self->state = EAT_WHITESPACE;
} else {
self->state = START_FIELD;
}
} else if (IS_TERMINATOR(c)) {
END_FIELD();
END_LINE();
} else if (IS_CARRIAGE(c)) {
END_FIELD();
self->state = EAT_CRNL;
} else {
PUSH_CHAR(c);
self->state = IN_FIELD;
}
break;
// A comment that began after some field data: the line is still ended
// through END_LINE.
case EAT_COMMENT:
if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->state = EAT_CRNL;
}
break;
// only occurs with non-custom line terminator,
// which is why we directly check for '\n'
case EAT_CRNL:
if (c == '\n') {
END_LINE();
} else if (IS_DELIMITER(c)) {
if (self->delim_whitespace) {
END_LINE_STATE(EAT_WHITESPACE);
} else {
// Handle \r-delimited files
END_LINE_AND_FIELD_STATE(START_FIELD);
}
} else {
if (self->delim_whitespace) {
/* XXX
* first character of a new record--need to back up and
* reread
* to handle properly...
*/
i--;
buf--; // back up one character (HACK!)
END_LINE_STATE(START_RECORD);
} else {
// \r line terminator
// UGH. we don't actually want
// to consume the token. fix this later
self->stream_len = slen;
if (end_line(self) < 0) {
goto parsingerror;
}
stream = self->stream + self->stream_len;
slen = self->stream_len;
self->state = START_RECORD;
--i;
buf--; // let's try this character again (HACK!)
if (line_limit > 0 &&
self->lines == start_lines + line_limit) {
goto linelimit;
}
}
}
break;
// only occurs with non-custom line terminator,
// which is why we directly check for '\n'
case EAT_CRNL_NOP: // inside an ignored comment line
self->state = START_RECORD;
// \r line terminator -- parse this character again
if (c != '\n' && !IS_DELIMITER(c)) {
--i;
--buf;
}
break;
default:
break;
}
}
// whole buffer consumed
_TOKEN_CLEANUP();
TRACE(("Finished tokenizing input\n"))
return 0;
// Both labels are reached from inside the loop body, before the loop's
// own ++i has run, so i is advanced here before it is stored as datapos.
parsingerror:
i++;
_TOKEN_CLEANUP();
return -1;
linelimit:
i++;
_TOKEN_CLEANUP();
return 0;
}
/*
 * Close out the final record once the input source is exhausted.
 *
 * Must only be called with an empty input window (self->datalen == 0);
 * otherwise -1 is returned immediately. Depending on the tokenizer state
 * this either does nothing, reports an "EOF inside ..." error, or closes
 * the pending field and line.
 *
 * Returns 0 on success, -1 on error (self->error_msg is set for the EOF
 * errors when it can be allocated).
 */
static int parser_handle_eof(parser_t *self) {
    int64_t bufsize = 100;

    TRACE(
        ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))

    if (self->datalen != 0) return -1;

    switch (self->state) {
        // nothing is pending in these states
        case START_RECORD:
        case WHITESPACE_LINE:
        case EAT_CRNL_NOP:
        case EAT_LINE_COMMENT:
            return 0;

        case ESCAPE_IN_QUOTED_FIELD:
        case IN_QUOTED_FIELD:
            self->error_msg = (char *)malloc(bufsize);
            // malloc may fail; never hand snprintf a NULL destination
            if (self->error_msg != NULL) {
                snprintf(self->error_msg, bufsize,
                         "EOF inside string starting at row %" PRIu64,
                         self->file_lines);
            }
            return -1;

        case ESCAPED_CHAR:
            self->error_msg = (char *)malloc(bufsize);
            if (self->error_msg != NULL) {
                snprintf(self->error_msg, bufsize,
                         "EOF following escape character");
            }
            return -1;

        // a field is still open: close it before closing the line
        case IN_FIELD:
        case START_FIELD:
        case QUOTE_IN_QUOTED_FIELD:
            if (end_field(self) < 0) return -1;
            break;

        default:
            break;
    }

    return (end_line(self) < 0) ? -1 : 0;
}
/*
 * Drop the first `nrows` tokenized rows from the parser's buffers and
 * shift the remaining stream bytes, word metadata and line metadata down
 * to the front.
 *
 * `nrows` is clamped to the number of rows available (self->lines).
 * Always returns 0.
 */
int parser_consume_rows(parser_t *self, size_t nrows) {
int64_t offset, word_deletions;
uint64_t char_count, i;
if (nrows > self->lines) {
nrows = self->lines;
}
/* do nothing */
if (nrows == 0) return 0;
/* cannot guarantee that nrows + 1 has been observed */
// number of words belonging to the rows being removed
word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
if (word_deletions >= 1) {
// bytes up to and including the terminator of the last removed word
char_count = (self->word_starts[word_deletions - 1] +
strlen(self->words[word_deletions - 1]) + 1);
} else {
/* if word_deletions == 0 (i.e. this case) then char_count must
* be 0 too, as no data needs to be skipped */
char_count = 0;
}
TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
char_count));
/* move stream, only if something to move */
if (char_count < self->stream_len) {
memmove(self->stream, (self->stream + char_count),
self->stream_len - char_count);
}
/* buffer counts */
self->stream_len -= char_count;
/* move token metadata */
// Note: presumably words_len >= word_deletions always holds here, so
// this unsigned subtraction does not wrap around.
for (i = 0; i < self->words_len - word_deletions; ++i) {
offset = i + word_deletions;
self->words[i] = self->words[offset] - char_count;
self->word_starts[i] = self->word_starts[offset] - char_count;
}
self->words_len -= word_deletions;
/* move current word pointer to stream */
self->pword_start -= char_count;
self->word_start -= char_count;
/* move line metadata */
// Note: We should always have self->lines - nrows + 1 >= 0, so this
// subtraction will remain appropriately-typed.
// The "+ 1" also carries over the entry at index self->lines.
for (i = 0; i < self->lines - nrows + 1; ++i) {
offset = i + nrows;
self->line_start[i] = self->line_start[offset] - word_deletions;
self->line_fields[i] = self->line_fields[offset];
}
self->lines -= nrows;
return 0;
}
// Smallest power of two that is >= sz (1 for sz == 0).
static size_t _next_pow2(size_t sz) {
    size_t pow2;
    for (pow2 = 1; pow2 < sz; pow2 <<= 1) {
        // keep doubling
    }
    return pow2;
}
/*
 * Shrink the parser's buffers to roughly fit their current contents
 * (next power of two of the used length, plus one).
 *
 * Returns 0 on success or PARSER_OUT_OF_MEMORY if a realloc fails. A
 * failed realloc leaves the affected buffer pointer unchanged, so the
 * parser can still be cleaned up afterwards.
 */
int parser_trim_buffers(parser_t *self) {
    /*
      Free memory
     */
    size_t new_cap;
    void *newptr;
    uint64_t i;

    /**
     * Before we free up space and trim, we should
     * save how many words we saw when parsing, if
     * it exceeds the maximum number we saw before.
     *
     * This is important for when we read in chunks,
     * so that we can inform subsequent chunk parsing
     * as to how many words we could possibly see.
     */
    if (self->words_cap > self->max_words_cap) {
        self->max_words_cap = self->words_cap;
    }

    /* trim words, word_starts */
    new_cap = _next_pow2(self->words_len) + 1;
    if (new_cap < self->words_cap) {
        TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
        // go through a temporary: assigning realloc's result straight to
        // the member would lose (and leak) the buffer on failure
        newptr = realloc(self->words, new_cap * sizeof(char *));
        if (newptr == NULL) {
            return PARSER_OUT_OF_MEMORY;
        }
        self->words = (char **)newptr;

        newptr = realloc(self->word_starts, new_cap * sizeof(int64_t));
        if (newptr == NULL) {
            return PARSER_OUT_OF_MEMORY;
        }
        self->word_starts = (int64_t *)newptr;
        self->words_cap = new_cap;
    }

    /* trim stream */
    new_cap = _next_pow2(self->stream_len) + 1;
    TRACE(
        ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
         "%zu\n",
         new_cap, self->stream_cap, self->lines_cap));
    if (new_cap < self->stream_cap) {
        TRACE(
            ("parser_trim_buffers: new_cap < self->stream_cap, calling "
             "realloc\n"));
        newptr = realloc(self->stream, new_cap);
        if (newptr == NULL) {
            return PARSER_OUT_OF_MEMORY;
        } else {
            // Update the pointers in the self->words array (char **) if
            // `realloc` moved the `self->stream` buffer. This block
            // mirrors a similar block in `make_stream_space`.
            if (self->stream != newptr) {
                self->pword_start = (char *)newptr + self->word_start;

                for (i = 0; i < self->words_len; ++i) {
                    self->words[i] = (char *)newptr + self->word_starts[i];
                }
            }

            self->stream = newptr;
            self->stream_cap = new_cap;
        }
    }

    /* trim line_start, line_fields */
    new_cap = _next_pow2(self->lines) + 1;
    if (new_cap < self->lines_cap) {
        TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
        newptr = realloc(self->line_start,
                         new_cap * sizeof(int64_t));
        if (newptr == NULL) {
            return PARSER_OUT_OF_MEMORY;
        } else {
            self->line_start = newptr;
        }
        newptr = realloc(self->line_fields,
                         new_cap * sizeof(int64_t));
        if (newptr == NULL) {
            return PARSER_OUT_OF_MEMORY;
        } else {
            self->line_fields = newptr;
            self->lines_cap = new_cap;
        }
    }

    return 0;
}
/*
nrows : number of rows to tokenize (or until reach EOF)
all : tokenize all the data vs. certain number of rows
*/
/*
 * Shared driver behind tokenize_nrows / tokenize_all_rows.
 *
 * Alternates between refilling the input window (parser_buffer_bytes) and
 * tokenizing it (tokenize_bytes) until either `nrows` new rows exist
 * (when `all` is 0) or the source reports EOF, in which case the last
 * line is closed and the parser is marked FINISHED.
 *
 * Returns 0 on success, -1 when tokenizing fails, the result of
 * parser_handle_eof at EOF, or the non-zero status of a failed read.
 */
int _tokenize_helper(parser_t *self, size_t nrows, int all,
                     const char *encoding_errors) {
    int status = 0;
    const uint64_t start_lines = self->lines;

    if (self->state == FINISHED) {
        return 0;
    }

    TRACE((
        "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
        nrows, self->datapos, self->datalen));

    for (;;) {
        const int have_enough_rows =
            !all && (self->lines - start_lines >= nrows);
        if (have_enough_rows) {
            break;
        }

        // input window exhausted: fetch the next chunk
        if (self->datapos == self->datalen) {
            status = parser_buffer_bytes(self, self->chunksize,
                                         encoding_errors);

            if (status == REACHED_EOF) {
                // close out last line
                status = parser_handle_eof(self);
                self->state = FINISHED;
                break;
            }
            if (status != 0) {
                return status;
            }
        }

        TRACE(
            ("_tokenize_helper: Trying to process %d bytes, datalen=%d, "
             "datapos= %d\n",
             self->datalen - self->datapos, self->datalen, self->datapos));

        status = tokenize_bytes(self, nrows, start_lines);

        if (status < 0) {
            // XXX
            TRACE(
                ("_tokenize_helper: Status %d returned from tokenize_bytes, "
                 "breaking\n",
                 status));
            status = -1;
            break;
        }
    }
    TRACE(("leaving tokenize_helper\n"));
    return status;
}
// Tokenize until `nrows` more rows are available or the input ends.
// Returns the status of _tokenize_helper.
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
    return _tokenize_helper(self, nrows, 0, encoding_errors);
}
// Tokenize everything up to the end of the input (the row count passed
// down is ignored because `all` is set). Returns the status of
// _tokenize_helper.
int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
    return _tokenize_helper(self, -1, 1, encoding_errors);
}
/*
* Function: to_boolean
* --------------------
*
* Validate if item should be recognized as a boolean field.
*
* item: const char* representing parsed text
* val : pointer to a uint8_t of boolean representation
*
* If item is determined to be boolean, this method will set
* the appropriate value of val and return 0. A non-zero exit
* status means that item was not inferred to be boolean, and
* leaves the value of *val unmodified.
*/
int to_boolean(const char *item, uint8_t *val) {
    // case-insensitive match against the two accepted spellings
    const int is_true = (strcasecmp(item, "TRUE") == 0);

    if (!is_true && strcasecmp(item, "FALSE") != 0) {
        // not a boolean: *val is left untouched
        return -1;
    }

    *val = is_true ? 1 : 0;
    return 0;
}
// ---------------------------------------------------------------------------
// Implementation of xstrtod
//
// strtod.c
//
// Convert string to double
//
// Copyright (C) 2002 Michael Ringgaard. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. Neither the name of the project nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// -----------------------------------------------------------------------
// Modifications by Warren Weckesser, March 2011:
// * Rename strtod() to xstrtod().
// * Added decimal and sci arguments.
// * Skip trailing spaces.
// * Commented out the other functions.
// Modifications by Richard T Guy, August 2013:
// * Add tsep argument for thousands separator
//
// Digit budget for the integer fast path of xstrtod: leading digits are
// accumulated in an `unsigned int` while the digit count stays within this
// bound, after which the remaining digits are accumulated as a double.
// This is a pessimistic but quick assessment, assuming that each decimal
// digit requires 4 bits to store (and that a byte holds 8 bits).
const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;
/*
 * Function: xstrtod
 * -----------------
 *
 * Convert the leading part of a string to a double. Digits are accumulated
 * first and the decimal exponent is applied afterwards by repeated squaring
 * of 10.
 *
 * str          : NUL-terminated text to parse
 * endptr       : if not NULL, receives a pointer to the first character that
 *                was not consumed; it is NOT written on the early error
 *                returns (no digits, exponent out of range)
 * decimal      : character accepted as the decimal point
 * sci          : character introducing the exponent (matched ignoring case)
 * tsep         : thousands separator; one occurrence is skipped after each
 *                integer-part digit. '\0' disables this.
 * skip_trailing: when non-zero, whitespace following the number is consumed
 * error        : set to ERANGE when no digits were found, when the decimal
 *                exponent is outside [DBL_MIN_EXP, DBL_MAX_EXP] or when the
 *                result equals HUGE_VAL; not written otherwise
 * maybe_int    : if not NULL, set to 1 up front and cleared to 0 as soon as
 *                a decimal point or an exponent marker is seen
 *
 * Returns the parsed value, 0.0 when no digits were found, or HUGE_VAL when
 * the exponent is out of range.
 */
double xstrtod(const char *str, char **endptr, char decimal, char sci,
               char tsep, int skip_trailing, int *error, int *maybe_int) {
    // Upper bound checked before each exponent digit is folded in. It keeps
    // `n` at or below 999999999, so `n * 10 + digit` can never overflow a
    // 32-bit int, while every saturated value is still far outside the
    // accepted exponent range.
    const int max_exponent_value = 100000000;

    double number;
    unsigned int i_number = 0;
    int exponent;
    int negative;
    char *p = (char *)str;
    double p10;
    int n;
    int num_digits;
    int num_decimals;

    if (maybe_int != NULL) *maybe_int = 1;

    // Skip leading whitespace.
    while (isspace_ascii(*p)) p++;

    // Handle optional sign.
    negative = 0;
    switch (*p) {
        case '-':
            negative = 1;
            // fall through to increment position
        case '+':
            p++;
            break;
        default:
            break;
    }

    exponent = 0;
    num_digits = 0;
    num_decimals = 0;

    // Process string of digits: integer arithmetic first (fast path).
    while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
        i_number = i_number * 10 + (*p - '0');
        p++;
        num_digits++;

        p += (tsep != '\0' && *p == tsep);
    }
    number = i_number;

    if (num_digits > max_int_decimal_digits) {
        // process what's left as double
        while (isdigit_ascii(*p)) {
            number = number * 10. + (*p - '0');
            p++;
            num_digits++;

            p += (tsep != '\0' && *p == tsep);
        }
    }

    // Process decimal part.
    if (*p == decimal) {
        if (maybe_int != NULL) *maybe_int = 0;
        p++;

        while (isdigit_ascii(*p)) {
            number = number * 10. + (*p - '0');
            p++;
            num_digits++;
            num_decimals++;
        }

        exponent -= num_decimals;
    }

    if (num_digits == 0) {
        *error = ERANGE;
        return 0.0;
    }

    // Correct for sign.
    if (negative) number = -number;

    // Process an exponent string.
    if (toupper_ascii(*p) == toupper_ascii(sci)) {
        if (maybe_int != NULL) *maybe_int = 0;

        // Handle optional sign.
        negative = 0;
        switch (*++p) {
            case '-':
                negative = 1;
                // fall through to increment position
            case '+':
                p++;
                break;
            default:
                break;
        }

        // Process string of digits.
        num_digits = 0;
        n = 0;
        while (isdigit_ascii(*p)) {
            // Saturate rather than overflow: signed overflow is undefined
            // behaviour and a very long exponent would otherwise trigger it.
            if (n < max_exponent_value) {
                n = n * 10 + (*p - '0');
            }
            num_digits++;
            p++;
        }

        if (negative)
            exponent -= n;
        else
            exponent += n;

        // If no digits, after the 'e'/'E', un-consume it
        if (num_digits == 0) p--;
    }

    if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
        *error = ERANGE;
        return HUGE_VAL;
    }

    // Scale the result by 10^exponent using binary exponentiation.
    p10 = 10.;
    n = exponent;
    if (n < 0) n = -n;
    while (n) {
        if (n & 1) {
            if (exponent < 0)
                number /= p10;
            else
                number *= p10;
        }
        n >>= 1;
        p10 *= p10;
    }

    // NOTE(review): only +HUGE_VAL is flagged here, whereas precise_xstrtod
    // also flags -HUGE_VAL. Confirm whether negative overflow should set
    // ERANGE as well.
    if (number == HUGE_VAL) {
        *error = ERANGE;
    }

    if (skip_trailing) {
        // Skip trailing whitespace.
        while (isspace_ascii(*p)) p++;
    }

    if (endptr) *endptr = p;
    return number;
}
/*
 * Function: precise_xstrtod
 * -------------------------
 *
 * Convert the leading part of a string to a double, keeping at most 17
 * significant digits and scaling once with a cached power of ten.
 *
 * str          : NUL-terminated text to parse
 * endptr       : if not NULL, receives a pointer to the first character that
 *                was not consumed; it is NOT written on the early error
 *                returns (no digits, exponent above 308)
 * decimal      : character accepted as the decimal point
 * sci          : character introducing the exponent (matched ignoring case)
 * tsep         : thousands separator; one occurrence is skipped after each
 *                integer-part digit. '\0' disables this.
 * skip_trailing: when non-zero, whitespace following the number is consumed
 * error        : set to ERANGE when no digits were found, when the decimal
 *                exponent exceeds 308 or when the result is +/-HUGE_VAL;
 *                not written otherwise
 * maybe_int    : if not NULL, set to 1 up front and cleared to 0 as soon as
 *                a decimal point or an exponent marker is seen
 *
 * Leading zeros (before or after the decimal point) do not count towards the
 * 17-digit budget, so e.g. "0.00000000000000001" keeps its non-zero digit.
 *
 * Returns the parsed value, 0.0 when no digits were found, or HUGE_VAL when
 * the exponent is above 308.
 */
double precise_xstrtod(const char *str, char **endptr, char decimal,
                       char sci, char tsep, int skip_trailing,
                       int *error, int *maybe_int) {
    // Upper bound checked before each exponent digit is folded in. It keeps
    // `n` at or below 999999999, so `n * 10 + digit` can never overflow a
    // 32-bit int, while every saturated value is still far outside the
    // range handled by the power-of-ten table.
    const int max_exponent_value = 100000000;

    double number;
    int exponent;
    int negative;
    char *p = (char *)str;
    int num_digits;
    int num_decimals;
    int max_digits = 17;
    int n;
    int seen_digit = 0;  // any digit at all, including leading zeros

    if (maybe_int != NULL) *maybe_int = 1;

    // Cache powers of 10 in memory.
    static const double e[] = {
        1.,    1e1,   1e2,   1e3,   1e4,   1e5,   1e6,   1e7,   1e8,   1e9,
        1e10,  1e11,  1e12,  1e13,  1e14,  1e15,  1e16,  1e17,  1e18,  1e19,
        1e20,  1e21,  1e22,  1e23,  1e24,  1e25,  1e26,  1e27,  1e28,  1e29,
        1e30,  1e31,  1e32,  1e33,  1e34,  1e35,  1e36,  1e37,  1e38,  1e39,
        1e40,  1e41,  1e42,  1e43,  1e44,  1e45,  1e46,  1e47,  1e48,  1e49,
        1e50,  1e51,  1e52,  1e53,  1e54,  1e55,  1e56,  1e57,  1e58,  1e59,
        1e60,  1e61,  1e62,  1e63,  1e64,  1e65,  1e66,  1e67,  1e68,  1e69,
        1e70,  1e71,  1e72,  1e73,  1e74,  1e75,  1e76,  1e77,  1e78,  1e79,
        1e80,  1e81,  1e82,  1e83,  1e84,  1e85,  1e86,  1e87,  1e88,  1e89,
        1e90,  1e91,  1e92,  1e93,  1e94,  1e95,  1e96,  1e97,  1e98,  1e99,
        1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109,
        1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119,
        1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129,
        1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139,
        1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149,
        1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
        1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169,
        1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179,
        1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189,
        1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199,
        1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209,
        1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219,
        1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229,
        1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239,
        1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
        1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259,
        1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269,
        1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279,
        1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289,
        1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299,
        1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};

    // Skip leading whitespace.
    while (isspace_ascii(*p)) p++;

    // Handle optional sign.
    negative = 0;
    switch (*p) {
        case '-':
            negative = 1;
            // fall through to increment position
        case '+':
            p++;
            break;
        default:
            break;
    }

    number = 0.;
    exponent = 0;
    num_digits = 0;
    num_decimals = 0;

    // Process string of digits.
    while (isdigit_ascii(*p)) {
        seen_digit = 1;
        if (num_digits < max_digits) {
            number = number * 10. + (*p - '0');
            // Leading zeros carry no information: only start spending the
            // significant-digit budget once the value is non-zero.
            if (number != 0.) num_digits++;
        } else {
            // Digit is dropped; account for its magnitude instead.
            ++exponent;
        }

        p++;
        p += (tsep != '\0' && *p == tsep);
    }

    // Process decimal part
    if (*p == decimal) {
        if (maybe_int != NULL) *maybe_int = 0;
        p++;

        while (num_digits < max_digits && isdigit_ascii(*p)) {
            seen_digit = 1;
            number = number * 10. + (*p - '0');
            // Same rule as above: zeros right after the decimal point of a
            // value that is still zero do not use up the budget.
            if (number != 0.) num_digits++;
            num_decimals++;
            p++;
        }

        if (num_digits >= max_digits)  // Consume extra decimal digits.
            while (isdigit_ascii(*p)) ++p;

        exponent -= num_decimals;
    }

    if (!seen_digit) {
        *error = ERANGE;
        return 0.0;
    }

    // Correct for sign.
    if (negative) number = -number;

    // Process an exponent string.
    if (toupper_ascii(*p) == toupper_ascii(sci)) {
        if (maybe_int != NULL) *maybe_int = 0;

        // Handle optional sign
        negative = 0;
        switch (*++p) {
            case '-':
                negative = 1;
                // fall through to increment position
            case '+':
                p++;
                break;
            default:
                break;
        }

        // Process string of digits.
        num_digits = 0;
        n = 0;
        while (num_digits < max_digits && isdigit_ascii(*p)) {
            // Saturate rather than overflow: up to 17 digits are accepted
            // here, which does not fit in an int, and signed overflow is
            // undefined behaviour.
            if (n < max_exponent_value) {
                n = n * 10 + (*p - '0');
            }
            num_digits++;
            p++;
        }

        if (negative)
            exponent -= n;
        else
            exponent += n;

        // If no digits after the 'e'/'E', un-consume it.
        if (num_digits == 0) p--;
    }

    if (exponent > 308) {
        *error = ERANGE;
        return HUGE_VAL;
    } else if (exponent > 0) {
        number *= e[exponent];
    } else if (exponent < -308) {  // Subnormal
        if (exponent < -616) {     // Prevent invalid array access.
            number = 0.;
        } else {
            // Divide in two steps so both table indices stay within 0..308.
            number /= e[-308 - exponent];
            number /= e[308];
        }
    } else {
        number /= e[-exponent];
    }

    if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE;

    if (skip_trailing) {
        // Skip trailing whitespace.
        while (isspace_ascii(*p)) p++;
    }

    if (endptr) *endptr = p;
    return number;
}
/* Copy a decimal number string with `decimal`, `tsep` as decimal point
   and thousands separator to an equivalent c-locale decimal string (stripping
   `tsep`, replacing `decimal` with '.').

   s      : NUL-terminated source text
   endpos : if not NULL, receives a pointer into `s` just past the last
            character that was copied (or `s` itself when allocation fails)
   decimal: decimal point character used in `s`
   tsep   : thousands separator to drop from the integer part ('\0' for none)

   Leading whitespace is skipped. The exponent marker recognised here is
   always 'e'/'E'.

   Returns a newly allocated string that should be free-d with a call to
   `free`, or NULL when the allocation fails.
*/
char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                              char tsep) {
    const char *p = s;
    // The copy never grows: characters are copied, replaced 1:1 or dropped.
    size_t length = strlen(s);
    char *s_copy = malloc(length + 1);
    char *dst = s_copy;

    if (s_copy == NULL) {
        // Out of memory: nothing was consumed, let the caller decide.
        if (endpos != NULL) *endpos = (char *)s;
        return NULL;
    }

    // Skip leading whitespace.
    while (isspace_ascii(*p)) p++;

    // Copy Leading sign
    if (*p == '+' || *p == '-') {
        *dst++ = *p++;
    }

    // Copy integer part dropping `tsep`
    while (isdigit_ascii(*p)) {
        *dst++ = *p++;
        p += (tsep != '\0' && *p == tsep);
    }

    // Replace `decimal` with '.'
    if (*p == decimal) {
        *dst++ = '.';
        p++;
    }

    // Copy fractional part after decimal (if any)
    while (isdigit_ascii(*p)) {
        *dst++ = *p++;
    }

    // Copy exponent if any
    if (toupper_ascii(*p) == toupper_ascii('E')) {
        *dst++ = *p++;

        // Copy leading exponent sign (if any)
        if (*p == '+' || *p == '-') {
            *dst++ = *p++;
        }

        // Copy exponent digits
        while (isdigit_ascii(*p)) {
            *dst++ = *p++;
        }
    }

    *dst = '\0';  // terminate

    if (endpos != NULL) *endpos = (char *)p;
    return s_copy;
}
/*
 * Function: round_trip
 * --------------------
 *
 * Convert a decimal string to a double by normalising it to C-locale form
 * and handing it to PyOS_string_to_double.
 *
 * p            : NUL-terminated text to parse
 * q            : if not NULL, receives the end position inside `p` on
 *                success, or `p` itself when the text could not be fully
 *                converted
 * decimal      : decimal point character used in `p`
 * sci          : not consulted by this function
 * tsep         : thousands separator used in `p` ('\0' for none)
 * skip_trailing: when non-zero (and something was consumed), whitespace
 *                after the number is skipped in *q
 * error        : set to -1 when the copy cannot be allocated, when the
 *                normalised string is not consumed entirely or when a Python
 *                error is pending; set to ERANGE when the result equals
 *                Py_HUGE_VAL; not written otherwise
 * maybe_int    : if not NULL, always set to 0
 *
 * The GIL is acquired for the duration of the Python call and any pending
 * Python error is cleared before returning.
 */
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                  int skip_trailing, int *error, int *maybe_int) {
    // 'normalize' representation to C-locale; replace decimal with '.' and
    // remove t(housand)sep.
    char *endptr;
    char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);

    if (pc == NULL) {
        // The normalised copy could not be produced; report a plain failure
        // without touching the Python C API.
        *error = -1;
        if (q != NULL) *q = (char *)p;
        if (maybe_int != NULL) *maybe_int = 0;
        return 0.0;
    }

    // This is called from a nogil block in parsers.pyx
    // so need to explicitly get GIL before Python calls
    PyGILState_STATE gstate;
    gstate = PyGILState_Ensure();

    char *endpc;
    double r = PyOS_string_to_double(pc, &endpc, 0);

    // PyOS_string_to_double needs to consume the whole string
    if (endpc == pc + strlen(pc)) {
        if (q != NULL) {
            // report endptr from source string (p)
            *q = endptr;
        }
    } else {
        *error = -1;
        if (q != NULL) {
            // p and pc are different len due to tsep removal. Can't report
            // how much it has consumed of p. Just rewind to beginning.
            *q = (char *)p;  // TODO(willayd): this could be undefined behavior
        }
    }

    if (maybe_int != NULL) *maybe_int = 0;

    if (PyErr_Occurred() != NULL) {
        *error = -1;
    } else if (r == Py_HUGE_VAL) {
        // Converting an infinite double to int is undefined behaviour, so
        // report overflow with ERANGE like xstrtod/precise_xstrtod do.
        // NOTE(review): -Py_HUGE_VAL is not flagged here; confirm whether
        // negative overflow should be reported too.
        *error = ERANGE;
    }

    PyErr_Clear();
    PyGILState_Release(gstate);
    free(pc);

    if (skip_trailing && q != NULL && *q != p) {
        while (isspace_ascii(**q)) {
            (*q)++;
        }
    }

    return r;
}
// End of xstrtod code
// ---------------------------------------------------------------------------
/*
 * Reset a uint_state: all three `seen_*` flags are cleared to 0.
 */
void uint_state_init(uint_state *self) {
    self->seen_null = 0;
    self->seen_uint = 0;
    self->seen_sint = 0;
}
/*
 * Report whether the recorded flags are in conflict.
 *
 * Returns 1 when `seen_uint` is set together with `seen_sint` or
 * `seen_null`, and 0 otherwise.
 */
int uint64_conflict(uint_state *self) {
    // Without the uint flag there is nothing to conflict with.
    if (!self->seen_uint) {
        return 0;
    }
    return self->seen_sint || self->seen_null;
}
/*
 * Function: str_to_int64
 * ----------------------
 *
 * Parse a whole field as a signed integer within [int_min, int_max].
 *
 * p_item : NUL-terminated text of the field
 * int_min: smallest value accepted (bounds negative input)
 * int_max: largest value accepted (bounds non-negative input)
 * error  : always written: 0 on success, ERROR_NO_DIGITS when no digit
 *          follows the optional sign, ERROR_OVERFLOW when the value leaves
 *          the bounds, ERROR_INVALID_CHARS when text other than whitespace
 *          follows the number
 * tsep   : thousands separator skipped wherever it occurs between or after
 *          the digits; '\0' disables this
 *
 * Leading and trailing whitespace is ignored. Returns the parsed value, or
 * 0 whenever *error is non-zero.
 */
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                     int *error, char tsep) {
    const char *cursor = p_item;
    int64_t acc = 0;
    int negative = 0;
    int ch;

    // Skip leading spaces.
    while (isspace_ascii(*cursor)) {
        cursor++;
    }

    // Handle sign.
    if (*cursor == '-') {
        negative = 1;
        cursor++;
    } else if (*cursor == '+') {
        cursor++;
    }

    // At least one digit must follow.
    if (!isdigit_ascii(*cursor)) {
        *error = ERROR_NO_DIGITS;
        return 0;
    }

    // While `acc` is strictly inside `limit`, one more digit always fits.
    // When it equals `limit`, the digit must not exceed `last_digit`.
    const int64_t limit = (negative ? int_min : int_max) / 10;
    const int last_digit =
        (int)(negative ? -(int_min % 10) : (int_max % 10));

    // Process the digits. Negative numbers are accumulated downwards so the
    // most negative bound can be represented.
    ch = *cursor;
    for (;;) {
        if (tsep != '\0' && ch == tsep) {
            ch = *++cursor;
            continue;
        }
        if (!isdigit_ascii(ch)) {
            break;
        }

        const int digit = ch - '0';
        int overflows;
        if (negative) {
            overflows = (acc < limit) || (acc == limit && digit > last_digit);
        } else {
            overflows = (acc > limit) || (acc == limit && digit > last_digit);
        }
        if (overflows) {
            *error = ERROR_OVERFLOW;
            return 0;
        }

        acc = negative ? acc * 10 - digit : acc * 10 + digit;
        ch = *++cursor;
    }

    // Skip trailing spaces.
    while (isspace_ascii(*cursor)) {
        cursor++;
    }

    // Anything left over means the field is not purely numeric.
    if (*cursor != '\0') {
        *error = ERROR_INVALID_CHARS;
        return 0;
    }

    *error = 0;
    return acc;
}
/*
 * Function: str_to_uint64
 * -----------------------
 *
 * Parse a whole field as an unsigned integer no larger than uint_max.
 *
 * state   : flags updated as a side effect: `seen_sint` is set when the
 *           field starts with '-', `seen_uint` when the parsed value is
 *           greater than int_max
 * p_item  : NUL-terminated text of the field
 * int_max : threshold above which `seen_uint` is recorded
 * uint_max: largest value accepted
 * error   : always written: 0 on success (and for a field starting with
 *           '-'), ERROR_NO_DIGITS when no digit follows the optional '+',
 *           ERROR_OVERFLOW when the value exceeds uint_max,
 *           ERROR_INVALID_CHARS when text other than whitespace follows
 * tsep    : thousands separator skipped wherever it occurs between or after
 *           the digits; '\0' disables this
 *
 * Leading and trailing whitespace is ignored. Returns the parsed value, or
 * 0 for a '-' prefixed field and whenever *error is non-zero.
 */
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                       uint64_t uint_max, int *error, char tsep) {
    // While `acc` is below `limit`, one more digit always fits. When it
    // equals `limit`, the digit must not exceed `last_digit`.
    const uint64_t limit = uint_max / 10;
    const int last_digit = (int)(uint_max % 10);
    const char *cursor = p_item;
    uint64_t acc = 0;
    int ch;

    // Skip leading spaces.
    while (isspace_ascii(*cursor)) {
        cursor++;
    }

    // A minus sign is only recorded; the field is not parsed any further.
    if (*cursor == '-') {
        state->seen_sint = 1;
        *error = 0;
        return 0;
    }
    if (*cursor == '+') {
        cursor++;
    }

    // At least one digit must follow.
    if (!isdigit_ascii(*cursor)) {
        *error = ERROR_NO_DIGITS;
        return 0;
    }

    // Process the digits.
    ch = *cursor;
    for (;;) {
        if (tsep != '\0' && ch == tsep) {
            ch = *++cursor;
            continue;
        }
        if (!isdigit_ascii(ch)) {
            break;
        }

        const int digit = ch - '0';
        if ((acc > limit) || (acc == limit && digit > last_digit)) {
            *error = ERROR_OVERFLOW;
            return 0;
        }

        acc = acc * 10 + digit;
        ch = *++cursor;
    }

    // Skip trailing spaces.
    while (isspace_ascii(*cursor)) {
        cursor++;
    }

    // Anything left over means the field is not purely numeric.
    if (*cursor != '\0') {
        *error = ERROR_INVALID_CHARS;
        return 0;
    }

    if (acc > (uint64_t)int_max) {
        state->seen_uint = 1;
    }

    *error = 0;
    return acc;
}