Repository URL to install this package:
|
Version:
2.68.0.50 ▾
|
"""Database utilities consists of routines for obfuscating SQL, retrieving
explain plans for SQL etc.
"""
import logging
import re
import time
import weakref
from newrelic.core.internal_metrics import (internal_trace, internal_metric)
from newrelic.core.config import global_settings
_logger = logging.getLogger(__name__)
# Obfuscation of SQL is done when reporting SQL statements back to the
# data collector so that sensitive information is not being passed.
# Obfuscation consists of replacing any quoted strings, integer or float
# literals with a '?'. For quoted strings which types of quoted strings
# should be collapsed depend on the database in use.
# See http://stackoverflow.com/questions/6718874.
#
# Escaping of quotes in SQL isn't like normal C style string. That is,
# no backslash. Uses two successive instances of quote character in
# middle of the string to indicate one embedded quote.
_single_quotes_p = "'(?:[^']|'')*'"
_double_quotes_p = '"(?:[^"]|"")*"'
_any_quotes_p = _single_quotes_p + '|' + _double_quotes_p
_single_quotes_re = re.compile(_single_quotes_p)
_double_quotes_re = re.compile(_double_quotes_p)
_any_quotes_re = re.compile(_any_quotes_p)
# See http://www.regular-expressions.info/examplesprogrammer.html.
#
# Is important to take word boundaries into consideration so we do not
# match numbers which are used on the end of identifiers. Technically
# this will not match numbers on the start of identifiers even though
# leading digits on identifiers aren't valid anyway. As it shouldn't
# occur, shouldn't be an issue.
#
# We add one variation here in that don't want to replace a number that
# follows on from a ':'. This is because ':1' can be used as positional
# parameter with database adapters where 'paramstyle' is 'numeric'.
_int_re = re.compile(r'(?<!:)\b\d+\b')
_quotes_table = {
'single': _single_quotes_re,
'double': _double_quotes_re,
'single+double': _any_quotes_re,
}
_quotes_default = _single_quotes_re
@internal_trace('Supportability/Python/DatabaseUtils/Calls/obfuscate_sql')
def _obfuscate_sql(sql, database):
quotes_re = _quotes_table.get(database.quoting_style, _single_quotes_re)
# Substitute quoted strings first.
sql = quotes_re.sub('?', sql)
# Replace straight integer values. This will pick up
# integers by themselves but also as part of floating point
# numbers. Because of word boundary checks in pattern will
# not match numbers within identifier names.
sql = _int_re.sub('?', sql)
return sql
# Normalization of the SQL is done so that when we can produce a hash
# value for a slow SQL such that it generates the same value for two SQL
# statements where only difference is values that may have been used.
#
# The main thing we need to contend with is the value sets where can
# have variable numbers of values for which we collapse them down to a
# single value. We also need to replace all the different variations of
# param styles with a '?'. This is in case in one situation a literal
# was used but in another it was a param, but param style is something
# other than '?' that literals are otherwise converted to. We also strip
# out all whitespace between identifiers and non identifiers to cope
# with varying amounts being used in different cases. A single space is
# left between identifiers.
#
# Note that we pickup up both ':1' and ':name' with the sub pattern
# ':\w+'. This can match ':1name', which is not strictly correct, but
# then it likely isn't valid in SQL anyway for that param style.
_normalize_params_1_p = r'%\([^)]*\)s'
_normalize_params_1_re = re.compile(_normalize_params_1_p)
_normalize_params_2_p = r'%s'
_normalize_params_2_re = re.compile(_normalize_params_2_p)
_normalize_params_3_p = r':\w+'
_normalize_params_3_re = re.compile(_normalize_params_3_p)
_normalize_values_p = r'\([^)]+\)'
_normalize_values_re = re.compile(_normalize_values_p)
_normalize_whitespace_1_p = r'\s+'
_normalize_whitespace_1_re = re.compile(_normalize_whitespace_1_p)
_normalize_whitespace_2_p = r'\s+(?!\w)'
_normalize_whitespace_2_re = re.compile(_normalize_whitespace_2_p)
_normalize_whitespace_3_p = r'(?<!\w)\s+'
_normalize_whitespace_3_re = re.compile(_normalize_whitespace_3_p)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/normalize_sql')
def _normalize_sql(sql):
# Note we that do this as a series of regular expressions as
# using '|' in regular expressions is more expensive.
# Convert param style of '%(name)s' to '?'. We need to do
# this before collapsing sets of values to a single value
# due to the use of the parenthesis in the param style.
sql = _normalize_params_1_re.sub('?', sql)
# Collapse any parenthesised set of values to a single value.
sql = _normalize_values_re.sub('(?)', sql)
# Convert '%s', ':1' and ':name' param styles to '?'.
sql = _normalize_params_2_re.sub('?', sql)
sql = _normalize_params_3_re.sub('?', sql)
# Strip leading and trailing white space.
sql = sql.strip()
# Collapse multiple white space to single white space.
sql = _normalize_whitespace_1_re.sub(' ', sql)
# Drop spaces adjacent to identifier except for case where
# identifiers follow each other.
sql = _normalize_whitespace_2_re.sub('', sql)
sql = _normalize_whitespace_3_re.sub('', sql)
return sql
# Helper function for extracting out any identifier from a string which
# might be preceded or followed by punctuation which we can expect in
# context of SQL statements.
#
# TODO This isn't always going to do what we require, but it is only used
# now for cases below which never get invoked, so is okay for now.
_identifier_re = re.compile('[\',"`\[\]\(\)]*')
def _extract_identifier(token):
return _identifier_re.sub('', token).strip().lower()
# Helper function for removing C style comments embedded in SQL statements.
_uncomment_sql_p = r'/\*.*?\*/'
_uncomment_sql_re = re.compile(_uncomment_sql_p, re.DOTALL)
def _uncomment_sql(sql):
return _uncomment_sql_re.sub('', sql)
# Parser routines for the different SQL statement operation types.
#
# Picking out the name of the target identifier for the specific
# operation is more than a bit tricky. This is because names can contain
# special characters. Quoting can also be used to allow any characters
# with escaping of quotes with such values following the SQL
# conventions.
#
# The most broad naming rule which includes all other databases appears
# to be SQL server:
#
# http://msdn.microsoft.com/en-us/library/ms175874.aspx
#
# which says:
#
# The first character must be one of the following:
#
# A letter as defined by the Unicode Standard 3.2. The Unicode
# definition of letters includes Latin characters from a through z,
# from A through Z, and also letter characters from other languages.
#
# The underscore (_), at sign (@), or number sign (#).
#
# Subsequent characters can include the following:
#
# Letters as defined in the Unicode Standard 3.2.
#
# Decimal numbers from either Basic Latin or other national scripts.
#
# The at sign, dollar sign ($), number sign, or underscore.
#
# One of the problems is that letters and numbers can be any which are
# valid for the locale in use, but how do we know for sure that locale
# setting of the process and what is used by the regular expression
# library even matches what the database is using. We therefore have to
# avoid trying to use \w pattern even if LOCALE flag is used.
#
# On top of the character set issues and quoting, different types of
# bracketing such as () and [] can also be used around names.
#
# Because of the difficulty in handling locales especially, what we do
# instead is try and match based on whatever occurs between the
# different delimiters we expect. That way we do not have to worry about
# locale.
#
# In the case of a schema and table reference, when quoting is used,
# the same type of quoting must be used on each in the one statement.
# You cannot mix different quoting schemes as then the regex gets
# even more messy.
def _parse_default(sql, regex):
match = regex.search(sql)
return match and _extract_identifier(match.group(1)) or ''
_parse_identifier_1_p = r'"((?:[^"]|"")+)"(?:\."((?:[^"]|"")+)")?'
_parse_identifier_2_p = r"'((?:[^']|'')+)'(?:\.'((?:[^']|'')+)')?"
_parse_identifier_3_p = r'`((?:[^`]|``)+)`(?:\.`((?:[^`]|``)+)`)?'
_parse_identifier_4_p = r'\[\s*(\S+)\s*\]'
_parse_identifier_5_p = r'\(\s*(\S+)\s*\)'
_parse_identifier_6_p = r'([^\s\(\)\[\],]+)'
_parse_identifier_p = ''.join(('(', _parse_identifier_1_p, '|',
_parse_identifier_2_p, '|', _parse_identifier_3_p, '|',
_parse_identifier_4_p, '|', _parse_identifier_5_p, '|',
_parse_identifier_6_p, ')'))
_parse_from_p = '\s+FROM\s+' + _parse_identifier_p
_parse_from_re = re.compile(_parse_from_p, re.IGNORECASE)
def _join_identifier(m):
return m and '.'.join([s for s in m.groups()[1:] if s]).lower() or ''
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_select')
def _parse_select(sql):
return _join_identifier(_parse_from_re.search(sql))
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_delete')
def _parse_delete(sql):
return _join_identifier(_parse_from_re.search(sql))
_parse_into_p = '\s+INTO\s+' + _parse_identifier_p
_parse_into_re = re.compile(_parse_into_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_insert')
def _parse_insert(sql):
return _join_identifier(_parse_into_re.search(sql))
_parse_update_p = '\s*UPDATE\s+' + _parse_identifier_p
_parse_update_re = re.compile(_parse_update_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_update')
def _parse_update(sql):
return _join_identifier(_parse_update_re.search(sql))
_parse_table_p = '\s+TABLE\s+' + _parse_identifier_p
_parse_table_re = re.compile(_parse_table_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_create')
def _parse_create(sql):
return _join_identifier(_parse_table_re.search(sql))
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_drop')
def _parse_drop(sql):
return _join_identifier(_parse_table_re.search(sql))
_parse_call_p = r'\s*CALL\s+(?!\()(\w+)'
_parse_call_re = re.compile(_parse_call_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_call')
def _parse_call(sql):
return _parse_default(sql, _parse_call_re)
# TODO Following need to be reviewed again. They aren't currently used
# in actual use as only parse out target for select/insert/update/delete.
_parse_show_p = r'\s*SHOW\s+(.*)'
_parse_show_re = re.compile(_parse_show_p, re.IGNORECASE | re.DOTALL)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_show')
def _parse_show(sql):
return _parse_default(sql, _parse_show_re)
_parse_set_p = r'\s*SET\s+(.*?)\W+.*'
_parse_set_re = re.compile(_parse_set_p, re.IGNORECASE | re.DOTALL)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_set')
def _parse_set(sql):
return _parse_default(sql, _parse_set_re)
_parse_exec_p = r'\s*EXEC\s+(?!\()(\w+)'
_parse_exec_re = re.compile(_parse_exec_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_exec')
def _parse_exec(sql):
return _parse_default(sql, _parse_exec_re)
_parse_execute_p = r'\s*EXECUTE\s+(?!\()(\w+)'
_parse_execute_re = re.compile(_parse_execute_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_execute')
def _parse_execute(sql):
return _parse_default(sql, _parse_execute_re)
_parse_alter_p = r'\s*ALTER\s+(?!\()(\w+)'
_parse_alter_re = re.compile(_parse_alter_p, re.IGNORECASE)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/'
'parse_target_alter')
def _parse_alter(sql):
return _parse_default(sql, _parse_alter_re)
# For SQL queries, if a target of some sort, such as a table can be
# meaningfully extracted, then this table should map to the function
# which extracts it. If no target can be extracted, but it is still
# desired that the operation be broken out separately with new Datastore
# metrics, then the operation should still be added, but with the value
# being set to None.
_operation_table = {
'select': _parse_select,
'delete': _parse_delete,
'insert': _parse_insert,
'update': _parse_update,
'create': None,
'drop': None,
'call': _parse_call,
'show': None,
'set': None,
'exec': None,
'execute': None,
'alter': None,
'commit': None,
'rollback': None,
}
_parse_operation_p = r'(\w+)'
_parse_operation_re = re.compile(_parse_operation_p)
@internal_trace('Supportability/Python/DatabaseUtils/Calls/parse_operation')
def _parse_operation(sql):
match = _parse_operation_re.search(sql)
operation = match and match.group(1).lower() or ''
return operation if operation in _operation_table else ''
@internal_trace('Supportability/Python/DatabaseUtils/Calls/parse_target')
def _parse_target(sql, operation):
parse = _operation_table.get(operation, None)
return parse and parse(sql) or ''
# For explain plan obfuscation, the regular expression for matching the
# explain plan needs to give precedence to replacing double quotes from
# around table names, then single quotes from any text, typed or
# otherwise. We will swap single quotes with a token value. Next up we
# want to match anything we want to keep. Finally match all remaining
# numeric values. These we will also swap with a token value.
_explain_plan_postgresql_re_1_mask_false = re.compile(
r"""((?P<double_quotes>"[^"]*")|"""
r"""(?P<single_quotes>'([^']|'')*')|"""
r"""(?P<cost_analysis>\(cost=[^)]*\))|"""
r"""(?P<sub_plan_ref>\bSubPlan\s+\d+\b)|"""
r"""(?P<init_plan_ref>\bInitPlan\s+\d+\b)|"""
r"""(?P<dollar_var_ref>\$\d+\b)|"""
r"""(?P<numeric_value>(?<![\w])[-+]?\d*\.?\d+([eE][-+]?\d+)?\b))""")
_explain_plan_postgresql_re_1_mask_true = re.compile(
r"""((?P<double_quotes>"[^"]*")|"""
r"""(?P<single_quotes>'([^']|'')*'))""")
_explain_plan_postgresql_re_2 = re.compile(
r"""^(?P<label>[^:]*:\s+).*$""", re.MULTILINE)
def _obfuscate_explain_plan_postgresql_substitute(text, mask):
# Perform substitutions for the explain plan on the text string.
def replacement(matchobj):
# The replacement function is called for each match. The group
# dict of the match object will have a key corresponding to all
# groups that could have matched, but only the first encountered
# based on order of sub patterns will have non None value. We
# use the name of the sub pattern to determine if we keep the
# original value or swap it with our token value.
for name, value in list(matchobj.groupdict().items()):
if value is not None:
if name in ('numeric_value', 'single_quotes'):
return '?'
return value
if mask:
return _explain_plan_postgresql_re_1_mask_true.sub(replacement, text)
else:
return _explain_plan_postgresql_re_1_mask_false.sub(replacement, text)
def _obfuscate_explain_plan_postgresql(columns, rows, mask=None):
settings = global_settings()
if mask is None:
mask = (settings.debug.explain_plan_obfuscation == 'simple')
# Only deal with where we get back the one expected column. If we
# get more than one column just ignore the whole explain plan. Need
# to confirm whether we would always definitely only get one column.
# The reason we do this is that swapping the value of quoted strings
# could result in the collapsing of multiple rows and in that case
# not sure what we would do with values from any other columns.
if len(columns) != 1:
return None
# We need to join together all the separate rows of the explain plan
# back together again. This is because an embedded newline within
# any text quoted from the original SQL can result in that line of
# the explain plan being split across multiple rows.
text = '\n'.join(item[0] for item in rows)
# Now need to perform the replacements on the complete text of the
# explain plan.
text = _obfuscate_explain_plan_postgresql_substitute(text, mask)
# The mask option dictates whether we use the slightly more aggresive
# obfuscation and simply mask out any line preceded by a label.
if mask:
text = _explain_plan_postgresql_re_2.sub('\g<label>?', text)
# Now regenerate the list of rows by splitting again on newline.
rows = [(_,) for _ in text.split('\n')]
return columns, rows
_obfuscate_explain_plan_table = {
'Postgres': _obfuscate_explain_plan_postgresql
}
def _obfuscate_explain_plan(database, columns, rows):
obfuscator = _obfuscate_explain_plan_table.get(database.name)
if obfuscator:
return obfuscator(columns, rows)
return columns, rows
class SQLConnection(object):
def __init__(self, database, connection):
self.database = database
self.connection = connection
self.cursors = {}
def cursor(self, args=(), kwargs={}):
key = (args, frozenset(kwargs.items()))
cursor = self.cursors.get(key)
if cursor is None:
settings = global_settings()
if settings.debug.log_explain_plan_queries:
_logger.debug('Created database cursor for %r.',
self.database.client)
cursor = self.connection.cursor(*args, **kwargs)
self.cursors[key] = cursor
return cursor
def cleanup(self):
settings = global_settings()
if settings.debug.log_explain_plan_queries:
_logger.debug('Cleanup database connection for %r.',
self.database)
try:
self.connection.rollback()
pass
except (AttributeError, self.database.NotSupportedError):
pass
self.connection.close()
class SQLConnections(object):
def __init__(self, maximum=4):
self.connections = []
self.maximum = maximum
settings = global_settings()
if settings.debug.log_explain_plan_queries:
_logger.debug('Creating SQL connections cache %r.', self)
def connection(self, database, args, kwargs):
key = (database.client, args, kwargs)
connection = None
settings = global_settings()
for i, item in enumerate(self.connections):
if item[0] == key:
connection = item[1]
# Move to back of list so we know which is the
# most recently used all the time.
item = self.connections.pop(i)
self.connections.append(item)
break
if connection is None:
# If we are at the maximum number of connections to
# keep hold of, pop the one which has been used the
# longest amount of time.
if len(self.connections) == self.maximum:
connection = self.connections.pop(0)[1]
internal_metric('Supportability/Python/DatabaseUtils/Counts/'
'drop_database_connection', 1)
if settings.debug.log_explain_plan_queries:
_logger.debug('Drop database connection for %r as '
'reached maximum of %r.',
connection.database.client, self.maximum)
connection.cleanup()
connection = SQLConnection(database,
database.connect(*args, **kwargs))
self.connections.append((key, connection))
internal_metric('Supportability/Python/DatabaseUtils/Counts/'
'create_database_connection', 1)
if settings.debug.log_explain_plan_queries:
_logger.debug('Created database connection for %r.',
database.client)
return connection
def cleanup(self):
settings = global_settings()
if settings.debug.log_explain_plan_queries:
_logger.debug('Cleaning up SQL connections cache %r.', self)
for key, connection in self.connections:
connection.cleanup()
self.connections = []
def __enter__(self):
return self
def __exit__(self, exc, value, tb):
self.cleanup()
def _query_result_dicts_to_tuples(columns, rows):
# Query results come back as a list of rows. If each row is a
# dict, then its keys can be found in the columns list. Here, we
# transform each row from a dict to a tuple, ordering the items
# in the row in the same order as that found in columns.
# Handle case where query results are empty.
if not columns or not rows:
return None
return [tuple([row[col] for col in columns]) for row in rows]
@internal_trace('Supportability/Python/DatabaseUtils/Calls/explain_plan')
def _explain_plan(connections, sql, database, connect_params, cursor_params,
sql_parameters, execute_params):
query = '%s %s' % (database.explain_query, sql)
settings = global_settings()
if settings.debug.log_explain_plan_queries:
_logger.debug('Executing explain plan for %r on %r.', query,
database.client)
try:
args, kwargs = connect_params
connection = connections.connection(database, args, kwargs)
if cursor_params is not None:
args, kwargs = cursor_params
cursor = connection.cursor(args, kwargs)
else:
cursor = connection.cursor()
if execute_params is not None:
args, kwargs = execute_params
else:
args, kwargs = ((), {})
# If sql_parameters is None them args would need
# to be an empty sequence. Don't pass it just in
# case it wasn't for some reason, and only supply
# kwargs. Right now the only time we believe that
# passing in further params is needed is with
# oursql cursor execute() method, which has
# proprietary arguments outside of the DBAPI2
# specification.
if sql_parameters is not None:
cursor.execute(query, sql_parameters, *args, **kwargs)
else:
cursor.execute(query, **kwargs)
columns = []
if cursor.description:
for column in cursor.description:
columns.append(column[0])
rows = cursor.fetchall()
# If rows have been returned as a list of dicts, then convert
# them to a list of tuples before returning.
if settings.debug.log_explain_plan_queries:
_logger.debug('Explain plan row data type is %r',
rows and type(rows[0]))
if rows and isinstance(rows[0], dict):
rows = _query_result_dicts_to_tuples(columns, rows)
if not columns and not rows:
return None
return (columns, rows)
except Exception:
if settings.debug.log_explain_plan_queries:
_logger.exception('Error occurred when executing explain '
'plan for %r on %r where cursor_params=%r and '
'execute_params=%r.', query, database.client,
cursor_params, execute_params)
return None
def explain_plan(connections, sql_statement, connect_params, cursor_params,
sql_parameters, execute_params, sql_format):
# If no parameters supplied for creating database connection
# then mustn't have been a candidate for explain plans in the
# first place, so skip it.
if connect_params is None:
return
# Determine if we even know how to perform explain plans for
# this particular database.
database = sql_statement.database
if sql_statement.operation not in database.explain_stmts:
return
details = _explain_plan(connections, sql_statement.sql, database,
connect_params, cursor_params, sql_parameters, execute_params)
if details is not None and sql_format != 'raw':
return _obfuscate_explain_plan(database, *details)
return details
# Wrapper for information about a specific database.
class SQLDatabase(object):
def __init__(self, dbapi2_module):
self.dbapi2_module = dbapi2_module
def __getattr__(self, name):
return getattr(self.dbapi2_module, name)
@property
def name(self):
return getattr(self.dbapi2_module, '_nr_database_name', None)
@property
def client(self):
name = getattr(self.dbapi2_module, '__name__', None)
if name is None:
name = getattr(self.dbapi2_module, '__file__', None)
if name is None:
name = str(self.dbapi2_module)
return name
@property
def quoting_style(self):
result = getattr(self.dbapi2_module, '_nr_quoting_style', None)
if result is None:
result = 'single'
return result
@property
def explain_query(self):
return getattr(self.dbapi2_module, '_nr_explain_query', None)
@property
def explain_stmts(self):
result = getattr(self.dbapi2_module, '_nr_explain_stmts', None)
if result is None:
result = ()
return result
class SQLStatement(object):
def __init__(self, sql, database=None):
self.sql = sql
self.database = database
self._operation = None
self._target = None
self._uncommented = None
self._obfuscated = None
self._normalized = None
self._identifier = None
@property
def operation(self):
if self._operation is None:
self._operation = _parse_operation(self.uncommented)
return self._operation
@property
def target(self):
if self._target is None:
self._target = _parse_target(self.uncommented, self.operation)
return self._target
@property
def uncommented(self):
if self._uncommented is None:
self._uncommented = _uncomment_sql(self.sql)
return self._uncommented
@property
def obfuscated(self):
if self._obfuscated is None:
self._obfuscated = _obfuscate_sql(self.uncommented, self.database)
return self._obfuscated
@property
def normalized(self):
if self._normalized is None:
self._normalized = _normalize_sql(self.obfuscated)
return self._normalized
@property
def identifier(self):
if self._identifier is None:
self._identifier = hash(self.normalized)
return self._identifier
def formatted(self, sql_format):
if sql_format == 'off':
return ''
elif sql_format == 'raw':
return self.sql
else:
return self.obfuscated
_sql_statements = weakref.WeakValueDictionary()
def sql_statement(sql, dbapi2_module):
key = (sql, dbapi2_module)
result = _sql_statements.get(key, None)
if result is not None:
return result
database = SQLDatabase(dbapi2_module)
result = SQLStatement(sql, database)
_sql_statements[key] = result
return result