Updated script that can be controled by Nodejs web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,213 @@
"""
Core eval alignment algorithms.
"""
from __future__ import annotations
from functools import (
partial,
wraps,
)
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
import numpy as np
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation.common import result_type_many
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import F
from pandas.core.generic import NDFrame
from pandas.core.indexes.api import Index
def _align_core_single_unary_op(
term,
) -> tuple[partial | type[NDFrame], dict[str, Index] | None]:
typ: partial | type[NDFrame]
axes: dict[str, Index] | None = None
if isinstance(term.value, np.ndarray):
typ = partial(np.asanyarray, dtype=term.value.dtype)
else:
typ = type(term.value)
if hasattr(term.value, "axes"):
axes = _zip_axes_from_type(typ, term.value.axes)
return typ, axes
def _zip_axes_from_type(
typ: type[NDFrame], new_axes: Sequence[Index]
) -> dict[str, Index]:
return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}
def _any_pandas_objects(terms) -> bool:
"""
Check a sequence of terms for instances of PandasObject.
"""
return any(isinstance(term.value, PandasObject) for term in terms)
def _filter_special_cases(f) -> Callable[[F], F]:
@wraps(f)
def wrapper(terms):
# single unary operand
if len(terms) == 1:
return _align_core_single_unary_op(terms[0])
term_values = (term.value for term in terms)
# we don't have any pandas objects
if not _any_pandas_objects(terms):
return result_type_many(*term_values), None
return f(terms)
return wrapper
@_filter_special_cases
def _align_core(terms):
term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
term_dims = [terms[i].value.ndim for i in term_index]
from pandas import Series
ndims = Series(dict(zip(term_index, term_dims)))
# initial axes are the axes of the largest-axis'd term
biggest = terms[ndims.idxmax()].value
typ = biggest._constructor
axes = biggest.axes
naxes = len(axes)
gt_than_one_axis = naxes > 1
for value in (terms[i].value for i in term_index):
is_series = isinstance(value, ABCSeries)
is_series_and_gt_one_axis = is_series and gt_than_one_axis
for axis, items in enumerate(value.axes):
if is_series_and_gt_one_axis:
ax, itm = naxes - 1, value.index
else:
ax, itm = axis, items
if not axes[ax].is_(itm):
axes[ax] = axes[ax].union(itm)
for i, ndim in ndims.items():
for axis, items in zip(range(ndim), axes):
ti = terms[i].value
if hasattr(ti, "reindex"):
transpose = isinstance(ti, ABCSeries) and naxes > 1
reindexer = axes[naxes - 1] if transpose else items
term_axis_size = len(ti.axes[axis])
reindexer_size = len(reindexer)
ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
if ordm >= 1 and reindexer_size >= 10000:
w = (
f"Alignment difference on axis {axis} is larger "
f"than an order of magnitude on term {repr(terms[i].name)}, "
f"by more than {ordm:.4g}; performance may suffer."
)
warnings.warn(
w, category=PerformanceWarning, stacklevel=find_stack_level()
)
obj = ti.reindex(reindexer, axis=axis, copy=False)
terms[i].update(obj)
terms[i].update(terms[i].value.values)
return typ, _zip_axes_from_type(typ, axes)
def align_terms(terms):
"""
Align a set of terms.
"""
try:
# flatten the parse tree (a nested list, really)
terms = list(com.flatten(terms))
except TypeError:
# can't iterate so it must just be a constant or single variable
if isinstance(terms.value, (ABCSeries, ABCDataFrame)):
typ = type(terms.value)
return typ, _zip_axes_from_type(typ, terms.value.axes)
return np.result_type(terms.type), None
# if all resolved variables are numeric scalars
if all(term.is_scalar for term in terms):
return result_type_many(*(term.value for term in terms)).type, None
# perform the main alignment
typ, axes = _align_core(terms)
return typ, axes
def reconstruct_object(typ, obj, axes, dtype):
"""
Reconstruct an object given its type, raw value, and possibly empty
(None) axes.
Parameters
----------
typ : object
A type
obj : object
The value to use in the type constructor
axes : dict
The axes to use to construct the resulting pandas object
Returns
-------
ret : typ
An object of type ``typ`` with the value `obj` and possible axes
`axes`.
"""
try:
typ = typ.type
except AttributeError:
pass
res_t = np.result_type(obj.dtype, dtype)
if not isinstance(typ, partial) and issubclass(typ, PandasObject):
return typ(obj, dtype=res_t, **axes)
# special case for pathological things like ~True/~False
if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
ret_value = res_t.type(obj)
else:
ret_value = typ(obj).astype(res_t)
# The condition is to distinguish 0-dim array (returned in case of
# scalar) and 1 element array
# e.g. np.array(0) and np.array([0])
if (
len(obj.shape) == 1
and len(obj) == 1
and not isinstance(ret_value, np.ndarray)
):
ret_value = np.array([ret_value]).astype(res_t)
return ret_value

View File

@ -0,0 +1,2 @@
__all__ = ["eval"]
from pandas.core.computation.eval import eval

View File

@ -0,0 +1,8 @@
from __future__ import annotations
from pandas.compat._optional import import_optional_dependency
ne = import_optional_dependency("numexpr", errors="warn")
NUMEXPR_INSTALLED = ne is not None
__all__ = ["NUMEXPR_INSTALLED"]

View File

@ -0,0 +1,48 @@
from __future__ import annotations
from functools import reduce
import numpy as np
from pandas._config import get_option
def ensure_decoded(s) -> str:
"""
If we have bytes, decode them to unicode.
"""
if isinstance(s, (np.bytes_, bytes)):
s = s.decode(get_option("display.encoding"))
return s
def result_type_many(*arrays_and_dtypes):
"""
Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
argument limit.
"""
try:
return np.result_type(*arrays_and_dtypes)
except ValueError:
# we have > NPY_MAXARGS terms in our expression
return reduce(np.result_type, arrays_and_dtypes)
except TypeError:
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import is_extension_array_dtype
arr_and_dtypes = list(arrays_and_dtypes)
ea_dtypes, non_ea_dtypes = [], []
for arr_or_dtype in arr_and_dtypes:
if is_extension_array_dtype(arr_or_dtype):
ea_dtypes.append(arr_or_dtype)
else:
non_ea_dtypes.append(arr_or_dtype)
if non_ea_dtypes:
try:
np_dtype = np.result_type(*non_ea_dtypes)
except ValueError:
np_dtype = reduce(np.result_type, arrays_and_dtypes)
return find_common_type(ea_dtypes + [np_dtype])
return find_common_type(ea_dtypes)

View File

@ -0,0 +1,143 @@
"""
Engine classes for :func:`~pandas.eval`
"""
from __future__ import annotations
import abc
from typing import TYPE_CHECKING
from pandas.errors import NumExprClobberingError
from pandas.core.computation.align import (
align_terms,
reconstruct_object,
)
from pandas.core.computation.ops import (
MATHOPS,
REDUCTIONS,
)
from pandas.io.formats import printing
if TYPE_CHECKING:
from pandas.core.computation.expr import Expr
_ne_builtins = frozenset(MATHOPS + REDUCTIONS)
def _check_ne_builtin_clash(expr: Expr) -> None:
"""
Attempt to prevent foot-shooting in a helpful way.
Parameters
----------
expr : Expr
Terms can contain
"""
names = expr.names
overlap = names & _ne_builtins
if overlap:
s = ", ".join([repr(x) for x in overlap])
raise NumExprClobberingError(
f'Variables in expression "{expr}" overlap with builtins: ({s})'
)
class AbstractEngine(metaclass=abc.ABCMeta):
"""Object serving as a base class for all engines."""
has_neg_frac = False
def __init__(self, expr) -> None:
self.expr = expr
self.aligned_axes = None
self.result_type = None
def convert(self) -> str:
"""
Convert an expression for evaluation.
Defaults to return the expression as a string.
"""
return printing.pprint_thing(self.expr)
def evaluate(self) -> object:
"""
Run the engine on the expression.
This method performs alignment which is necessary no matter what engine
is being used, thus its implementation is in the base class.
Returns
-------
object
The result of the passed expression.
"""
if not self._is_aligned:
self.result_type, self.aligned_axes = align_terms(self.expr.terms)
# make sure no names in resolvers and locals/globals clash
res = self._evaluate()
return reconstruct_object(
self.result_type, res, self.aligned_axes, self.expr.terms.return_type
)
@property
def _is_aligned(self) -> bool:
return self.aligned_axes is not None and self.result_type is not None
@abc.abstractmethod
def _evaluate(self):
"""
Return an evaluated expression.
Parameters
----------
env : Scope
The local and global environment in which to evaluate an
expression.
Notes
-----
Must be implemented by subclasses.
"""
class NumExprEngine(AbstractEngine):
"""NumExpr engine class"""
has_neg_frac = True
def _evaluate(self):
import numexpr as ne
# convert the expression to a valid numexpr expression
s = self.convert()
env = self.expr.env
scope = env.full_scope
_check_ne_builtin_clash(self.expr)
return ne.evaluate(s, local_dict=scope)
class PythonEngine(AbstractEngine):
"""
Evaluate an expression in Python space.
Mostly for testing purposes.
"""
has_neg_frac = False
def evaluate(self):
return self.expr()
def _evaluate(self) -> None:
pass
ENGINES: dict[str, type[AbstractEngine]] = {
"numexpr": NumExprEngine,
"python": PythonEngine,
}

View File

@ -0,0 +1,415 @@
"""
Top level ``eval`` module.
"""
from __future__ import annotations
import tokenize
from typing import TYPE_CHECKING
import warnings
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.computation.engines import ENGINES
from pandas.core.computation.expr import (
PARSERS,
Expr,
)
from pandas.core.computation.parsing import tokenize_string
from pandas.core.computation.scope import ensure_scope
from pandas.core.generic import NDFrame
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from pandas.core.computation.ops import BinOp
def _check_engine(engine: str | None) -> str:
"""
Make sure a valid engine is passed.
Parameters
----------
engine : str
String to validate.
Raises
------
KeyError
* If an invalid engine is passed.
ImportError
* If numexpr was requested but doesn't exist.
Returns
-------
str
Engine name.
"""
from pandas.core.computation.check import NUMEXPR_INSTALLED
from pandas.core.computation.expressions import USE_NUMEXPR
if engine is None:
engine = "numexpr" if USE_NUMEXPR else "python"
if engine not in ENGINES:
valid_engines = list(ENGINES.keys())
raise KeyError(
f"Invalid engine '{engine}' passed, valid engines are {valid_engines}"
)
# TODO: validate this in a more general way (thinking of future engines
# that won't necessarily be import-able)
# Could potentially be done on engine instantiation
if engine == "numexpr" and not NUMEXPR_INSTALLED:
raise ImportError(
"'numexpr' is not installed or an unsupported version. Cannot use "
"engine='numexpr' for query/eval if 'numexpr' is not installed"
)
return engine
def _check_parser(parser: str):
"""
Make sure a valid parser is passed.
Parameters
----------
parser : str
Raises
------
KeyError
* If an invalid parser is passed
"""
if parser not in PARSERS:
raise KeyError(
f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}"
)
def _check_resolvers(resolvers):
if resolvers is not None:
for resolver in resolvers:
if not hasattr(resolver, "__getitem__"):
name = type(resolver).__name__
raise TypeError(
f"Resolver of type '{name}' does not "
"implement the __getitem__ method"
)
def _check_expression(expr):
"""
Make sure an expression is not an empty string
Parameters
----------
expr : object
An object that can be converted to a string
Raises
------
ValueError
* If expr is an empty string
"""
if not expr:
raise ValueError("expr cannot be an empty string")
def _convert_expression(expr) -> str:
"""
Convert an object to an expression.
This function converts an object to an expression (a unicode string) and
checks to make sure it isn't empty after conversion. This is used to
convert operators to their string representation for recursive calls to
:func:`~pandas.eval`.
Parameters
----------
expr : object
The object to be converted to a string.
Returns
-------
str
The string representation of an object.
Raises
------
ValueError
* If the expression is empty.
"""
s = pprint_thing(expr)
_check_expression(s)
return s
def _check_for_locals(expr: str, stack_level: int, parser: str):
at_top_of_stack = stack_level == 0
not_pandas_parser = parser != "pandas"
if not_pandas_parser:
msg = "The '@' prefix is only supported by the pandas parser"
elif at_top_of_stack:
msg = (
"The '@' prefix is not allowed in top-level eval calls.\n"
"please refer to your variables by name without the '@' prefix."
)
if at_top_of_stack or not_pandas_parser:
for toknum, tokval in tokenize_string(expr):
if toknum == tokenize.OP and tokval == "@":
raise SyntaxError(msg)
def eval(
expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users
parser: str = "pandas",
engine: str | None = None,
local_dict=None,
global_dict=None,
resolvers=(),
level: int = 0,
target=None,
inplace: bool = False,
):
"""
Evaluate a Python expression as a string using various backends.
The following arithmetic operations are supported: ``+``, ``-``, ``*``,
``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
:keyword:`or`, and :keyword:`not` with the same semantics as the
corresponding bitwise operators. :class:`~pandas.Series` and
:class:`~pandas.DataFrame` objects are supported and behave as they would
with plain ol' Python evaluation.
Parameters
----------
expr : str
The expression to evaluate. This string cannot contain any Python
`statements
<https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
only Python `expressions
<https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
parser : {'pandas', 'python'}, default 'pandas'
The parser to use to construct the syntax tree from the expression. The
default of ``'pandas'`` parses code slightly different than standard
Python. Alternatively, you can parse an expression using the
``'python'`` parser to retain strict Python semantics. See the
:ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
engine : {'python', 'numexpr'}, default 'numexpr'
The engine used to evaluate the expression. Supported engines are
- None : tries to use ``numexpr``, falls back to ``python``
- ``'numexpr'`` : This default engine evaluates pandas objects using
numexpr for large speed ups in complex expressions with large frames.
- ``'python'`` : Performs operations as if you had ``eval``'d in top
level python. This engine is generally not that useful.
More backends may be available in the future.
local_dict : dict or None, optional
A dictionary of local variables, taken from locals() by default.
global_dict : dict or None, optional
A dictionary of global variables, taken from globals() by default.
resolvers : list of dict-like or None, optional
A list of objects implementing the ``__getitem__`` special method that
you can use to inject an additional collection of namespaces to use for
variable lookup. For example, this is used in the
:meth:`~DataFrame.query` method to inject the
``DataFrame.index`` and ``DataFrame.columns``
variables that refer to their respective :class:`~pandas.DataFrame`
instance attributes.
level : int, optional
The number of prior stack frames to traverse and add to the current
scope. Most users will **not** need to change this parameter.
target : object, optional, default None
This is the target object for assignment. It is used when there is
variable assignment in the expression. If so, then `target` must
support item assignment with string keys, and if a copy is being
returned, it must also support `.copy()`.
inplace : bool, default False
If `target` is provided, and the expression mutates `target`, whether
to modify `target` inplace. Otherwise, return a copy of `target` with
the mutation.
Returns
-------
ndarray, numeric scalar, DataFrame, Series, or None
The completion value of evaluating the given code or None if ``inplace=True``.
Raises
------
ValueError
There are many instances where such an error can be raised:
- `target=None`, but the expression is multiline.
- The expression is multiline, but not all them have item assignment.
An example of such an arrangement is this:
a = b + 1
a + 2
Here, there are expressions on different lines, making it multiline,
but the last line has no variable assigned to the output of `a + 2`.
- `inplace=True`, but the expression is missing item assignment.
- Item assignment is provided, but the `target` does not support
string item assignment.
- Item assignment is provided and `inplace=False`, but the `target`
does not support the `.copy()` method
See Also
--------
DataFrame.query : Evaluates a boolean expression to query the columns
of a frame.
DataFrame.eval : Evaluate a string describing operations on
DataFrame columns.
Notes
-----
The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
recursively cast to ``float64``.
See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
Examples
--------
>>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
>>> df
animal age
0 dog 10
1 pig 20
We can add a new column using ``pd.eval``:
>>> pd.eval("double_age = df.age * 2", target=df)
animal age double_age
0 dog 10 20
1 pig 20 40
"""
inplace = validate_bool_kwarg(inplace, "inplace")
exprs: list[str | BinOp]
if isinstance(expr, str):
_check_expression(expr)
exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
else:
# ops.BinOp; for internal compat, not intended to be passed by users
exprs = [expr]
multi_line = len(exprs) > 1
if multi_line and target is None:
raise ValueError(
"multi-line expressions are only valid in the "
"context of data, use DataFrame.eval"
)
engine = _check_engine(engine)
_check_parser(parser)
_check_resolvers(resolvers)
ret = None
first_expr = True
target_modified = False
for expr in exprs:
expr = _convert_expression(expr)
_check_for_locals(expr, level, parser)
# get our (possibly passed-in) scope
env = ensure_scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
if engine == "numexpr" and (
is_extension_array_dtype(parsed_expr.terms.return_type)
or getattr(parsed_expr.terms, "operand_types", None) is not None
and any(
is_extension_array_dtype(elem)
for elem in parsed_expr.terms.operand_types
)
):
warnings.warn(
"Engine has switched to 'python' because numexpr does not support "
"extension array dtypes. Please set your engine to python manually.",
RuntimeWarning,
stacklevel=find_stack_level(),
)
engine = "python"
# construct the engine and evaluate the parsed expression
eng = ENGINES[engine]
eng_inst = eng(parsed_expr)
ret = eng_inst.evaluate()
if parsed_expr.assigner is None:
if multi_line:
raise ValueError(
"Multi-line expressions are only valid "
"if all expressions contain an assignment"
)
if inplace:
raise ValueError("Cannot operate inplace if there is no assignment")
# assign if needed
assigner = parsed_expr.assigner
if env.target is not None and assigner is not None:
target_modified = True
# if returning a copy, copy only on the first assignment
if not inplace and first_expr:
try:
target = env.target
if isinstance(target, NDFrame):
target = target.copy(deep=None)
else:
target = target.copy()
except AttributeError as err:
raise ValueError("Cannot return a copy of the target") from err
else:
target = env.target
# TypeError is most commonly raised (e.g. int, list), but you
# get IndexError if you try to do this assignment on np.ndarray.
# we will ignore numpy warnings here; e.g. if trying
# to use a non-numeric indexer
try:
if inplace and isinstance(target, NDFrame):
target.loc[:, assigner] = ret
else:
target[assigner] = ret # pyright: ignore[reportGeneralTypeIssues]
except (TypeError, IndexError) as err:
raise ValueError("Cannot assign expression output to target") from err
if not resolvers:
resolvers = ({assigner: ret},)
else:
# existing resolver needs updated to handle
# case of mutating existing column in copy
for resolver in resolvers:
if assigner in resolver:
resolver[assigner] = ret
break
else:
resolvers += ({assigner: ret},)
ret = None
first_expr = False
# We want to exclude `inplace=None` as being False.
if inplace is False:
return target if target_modified else ret

View File

@ -0,0 +1,836 @@
"""
:func:`~pandas.eval` parsers.
"""
from __future__ import annotations
import ast
from functools import (
partial,
reduce,
)
from keyword import iskeyword
import tokenize
from typing import (
Callable,
ClassVar,
TypeVar,
)
import numpy as np
from pandas.errors import UndefinedVariableError
import pandas.core.common as com
from pandas.core.computation.ops import (
ARITH_OPS_SYMS,
BOOL_OPS_SYMS,
CMP_OPS_SYMS,
LOCAL_TAG,
MATHOPS,
REDUCTIONS,
UNARY_OPS_SYMS,
BinOp,
Constant,
FuncNode,
Op,
Term,
UnaryOp,
is_term,
)
from pandas.core.computation.parsing import (
clean_backtick_quoted_toks,
tokenize_string,
)
from pandas.core.computation.scope import Scope
from pandas.io.formats import printing
def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]:
"""
Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
"""
Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
precedence is changed to boolean precedence.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == tokenize.OP:
if tokval == "&":
return tokenize.NAME, "and"
elif tokval == "|":
return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]:
"""
Replace local variables with a syntactically valid name.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
Notes
-----
This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, LOCAL_TAG
return toknum, tokval
def _compose2(f, g):
"""
Compose 2 callables.
"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
def _compose(*funcs):
"""
Compose 2 or more callables.
"""
assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
def _preparse(
source: str,
f=_compose(
_replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
),
) -> str:
"""
Compose a collection of tokenization functions.
Parameters
----------
source : str
A Python source code string
f : callable
This takes a tuple of (toknum, tokval) as its argument and returns a
tuple with the same structure but possibly different elements. Defaults
to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
``_replace_locals``.
Returns
-------
str
Valid Python source code
Notes
-----
The `f` parameter can be any callable that takes *and* returns input of the
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
assert callable(f), "f must be callable"
return tokenize.untokenize(f(x) for x in tokenize_string(source))
def _is_type(t):
"""
Factory for a type checking function of type ``t`` or tuple of types.
"""
return lambda x: isinstance(x.value, t)
_is_list = _is_type(list)
_is_str = _is_type(str)
# partition all AST nodes
_all_nodes = frozenset(
node
for node in (getattr(ast, name) for name in dir(ast))
if isinstance(node, type) and issubclass(node, ast.AST)
)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""
Filter out AST nodes that are subclasses of ``superclass``.
"""
node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
_all_node_names = frozenset(x.__name__ for x in _all_nodes)
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)
# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
_unsupported_expr_nodes = frozenset(
[
"Yield",
"GeneratorExp",
"IfExp",
"DictComp",
"SetComp",
"Repr",
"Lambda",
"Set",
"AST",
"Is",
"IsNot",
]
)
# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
_stmt_nodes
| _mod_nodes
| _handler_nodes
| _arguments_nodes
| _keyword_nodes
| _alias_nodes
| _expr_context_nodes
| _unsupported_expr_nodes
) - _hacked_nodes
# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in their so get only the class whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
intersection = _unsupported_nodes & _base_supported_nodes
_msg = f"cannot both support and not support {intersection}"
assert not intersection, _msg
def _node_not_implemented(node_name: str) -> Callable[..., None]:
"""
Return a function that raises a NotImplementedError with a passed node name.
"""
def f(self, *args, **kwargs):
raise NotImplementedError(f"'{node_name}' nodes are not implemented")
return f
# should be bound by BaseExprVisitor but that creates a circular dependency:
# _T is used in disallow, but disallow is used to define BaseExprVisitor
# https://github.com/microsoft/pyright/issues/2315
_T = TypeVar("_T")
def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
"""
Decorator to disallow certain nodes from parsing. Raises a
NotImplementedError instead.
Returns
-------
callable
"""
def disallowed(cls: type[_T]) -> type[_T]:
# error: "Type[_T]" has no attribute "unsupported_nodes"
cls.unsupported_nodes = () # type: ignore[attr-defined]
for node in nodes:
new_method = _node_not_implemented(node)
name = f"visit_{node}"
# error: "Type[_T]" has no attribute "unsupported_nodes"
cls.unsupported_nodes += (name,) # type: ignore[attr-defined]
setattr(cls, name, new_method)
return cls
return disallowed
def _op_maker(op_class, op_symbol):
"""
Return a function to create an op class with its symbol already passed.
Returns
-------
callable
"""
def f(self, node, *args, **kwargs):
"""
Return a partial function with an Op subclass with an operator already passed.
Returns
-------
callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
return f
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""
Decorator to add default implementation of ops.
"""
def f(cls):
for op_attr_name, op_class in op_classes.items():
ops = getattr(cls, f"{op_attr_name}_ops")
ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map")
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
setattr(cls, f"visit_{op_node}", made_op)
return cls
return f
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
"""
Custom ast walker. Parsers of other engines should subclass this class
if necessary.
Parameters
----------
env : Scope
engine : str
parser : str
preparser : callable
"""
const_type: ClassVar[type[Term]] = Constant
term_type: ClassVar[type[Term]] = Term
binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS
binary_op_nodes = (
"Gt",
"Lt",
"GtE",
"LtE",
"Eq",
"NotEq",
"In",
"NotIn",
"BitAnd",
"BitOr",
"And",
"Or",
"Add",
"Sub",
"Mult",
"Div",
"Pow",
"FloorDiv",
"Mod",
)
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = UNARY_OPS_SYMS
unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
ast.NotIn: ast.NotIn,
}
unsupported_nodes: tuple[str, ...]
def __init__(self, env, engine, parser, preparser=_preparse) -> None:
self.env = env
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None
def visit(self, node, **kwargs):
if isinstance(node, str):
clean = self.preparser(node)
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier in numexpr query"
raise e
method = f"visit_{type(node).__name__}"
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
def visit_Expr(self, node, **kwargs):
return self.visit(node.value, **kwargs)
def _rewrite_membership_op(self, node, left, right):
# the kind of the operator (is actually an instance)
op_instance = node.op
op_type = type(op_instance)
# must be two terms and the comparison operator must be ==/!=/in/not in
if is_term(left) and is_term(right) and op_type in self.rewrite_map:
left_list, right_list = map(_is_list, (left, right))
left_str, right_str = map(_is_str, (left, right))
# if there are any strings or lists in the expression
if left_list or right_list or left_str or right_str:
op_instance = self.rewrite_map[op_type]()
# pop the string variable out of locals and replace it with a list
# of one string, kind of a hack
if right_str:
name = self.env.add_tmp([right.value])
right = self.term_type(name, self.env)
if left_str:
name = self.env.add_tmp([left.value])
left = self.term_type(name, self.env)
op = self.visit(op_instance)
return op, op_instance, left, right
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
left = self.visit(node.left, side="left")
if right is None:
right = self.visit(node.right, side="right")
op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
if (
left.is_scalar
and hasattr(left, "value")
and not right.is_scalar
and right.return_type == f32
):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
if (
right.is_scalar
and hasattr(right, "value")
and not left.is_scalar
and left.return_type == f32
):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
return left, right
def _maybe_eval(self, binop, eval_in_python):
# eval `in` and `not in` (for now) in "partial" python space
# things that can be evaluated in "eval" space will be turned into
# temporary variables. for example,
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
return binop.evaluate(
self.env, self.engine, self.parser, self.term_type, eval_in_python
)
def _maybe_evaluate_binop(
self,
op,
op_class,
lhs,
rhs,
eval_in_python=("in", "not in"),
maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
res = op(lhs, rhs)
if res.has_invalid_return_type:
raise TypeError(
f"unsupported operand type(s) for {res.op}: "
f"'{lhs.type}' and '{rhs.type}'"
)
if self.engine != "pytables" and (
res.op in CMP_OPS_SYMS
and getattr(lhs, "is_datetime", False)
or getattr(rhs, "is_datetime", False)
):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
or getattr(rhs, "return_type", None) == object
):
# evaluate "==" and "!=" in python if either of our operands
# has an object return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
op, op_class, left, right = self._maybe_transform_eq_ne(node)
left, right = self._maybe_downcast_constants(left, right)
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_UnaryOp(self, node, **kwargs):
op = self.visit(node.op)
operand = self.visit(node.operand)
return op(operand)
def visit_Name(self, node, **kwargs) -> Term:
return self.term_type(node.id, self.env, **kwargs)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_NameConstant(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_Num(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
def visit_Constant(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_Str(self, node, **kwargs) -> Term:
name = self.env.add_tmp(node.s)
return self.term_type(name, self.env)
def visit_List(self, node, **kwargs) -> Term:
name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
return self.term_type(name, self.env)
visit_Tuple = visit_List
def visit_Index(self, node, **kwargs):
"""df.index[4]"""
return self.visit(node.value)
def visit_Subscript(self, node, **kwargs) -> Term:
from pandas import eval as pd_eval
value = self.visit(node.value)
slobj = self.visit(node.slice)
result = pd_eval(
slobj, local_dict=self.env, engine=self.engine, parser=self.parser
)
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
lhs = pd_eval(
value, local_dict=self.env, engine=self.engine, parser=self.parser
)
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
def visit_Slice(self, node, **kwargs) -> slice:
"""df.index[slice(4,6)]"""
lower = node.lower
if lower is not None:
lower = self.visit(lower).value
upper = node.upper
if upper is not None:
upper = self.visit(upper).value
step = node.step
if step is not None:
step = self.visit(step).value
return slice(lower, upper, step)
def visit_Assign(self, node, **kwargs):
"""
support a single assignment node, like
c = a + b
set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers
"""
if len(node.targets) != 1:
raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError("left hand side of an assignment must be a single name")
if self.env.target is None:
raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
raise SyntaxError(
"left hand side of an assignment must be a single resolvable name"
)
return self.visit(node.value, **kwargs)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
resolved = self.visit(value).value
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
return self.term_type(name, self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise
raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")
def visit_Call(self, node, side=None, **kwargs):
if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__":
res = self.visit_Attribute(node.func)
elif not isinstance(node.func, ast.Name):
raise TypeError("Only named functions are supported")
else:
try:
res = self.visit(node.func)
except UndefinedVariableError:
# Check if this is a supported function name
try:
res = FuncNode(node.func.id)
except ValueError:
# Raise original error
raise
if res is None:
# error: "expr" has no attribute "id"
raise ValueError(
f"Invalid function call {node.func.id}" # type: ignore[attr-defined]
)
if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
raise TypeError(
f'Function "{res.name}" does not support keyword arguments'
)
return res(*new_args)
else:
new_args = [self.visit(arg)(self.env) for arg in node.args]
for key in node.keywords:
if not isinstance(key, ast.keyword):
# error: "expr" has no attribute "id"
raise ValueError(
"keyword error in function call "
f"'{node.func.id}'" # type: ignore[attr-defined]
)
if key.arg:
kwargs[key.arg] = self.visit(key.value)(self.env)
name = self.env.add_tmp(res(*new_args, **kwargs))
return self.term_type(name=name, env=self.env)
def translate_In(self, op):
return op
def visit_Compare(self, node, **kwargs):
ops = node.ops
comps = node.comparators
# base case: we have something like a CMP b
if len(comps) == 1:
op = self.translate_In(ops[0])
binop = ast.BinOp(op=op, left=node.left, right=comps[0])
return self.visit(binop)
# recursive case: we have a chained comparison, a CMP b CMP c, etc.
left = node.left
values = []
for op, comp in zip(ops, comps):
new_node = self.visit(
ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
)
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
def _try_visit_binop(self, bop):
if isinstance(bop, (Op, Term)):
return bop
return self.visit(bop)
def visit_BoolOp(self, node, **kwargs):
def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS)
@disallow(
(_unsupported_nodes | _python_not_supported)
- (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
def __init__(
self,
env,
engine,
parser,
preparser=partial(
_preparse,
f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
),
) -> None:
super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
def __init__(
self, env, engine, parser, preparser=lambda source, f=None: source
) -> None:
super().__init__(env, engine, parser, preparser=preparser)
class Expr:
"""
Object encapsulating an expression.
Parameters
----------
expr : str
engine : str, optional, default 'numexpr'
parser : str, optional, default 'pandas'
env : Scope, optional, default None
level : int, optional, default 2
"""
env: Scope
engine: str
parser: str
def __init__(
self,
expr,
engine: str = "numexpr",
parser: str = "pandas",
env: Scope | None = None,
level: int = 0,
) -> None:
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
def __repr__(self) -> str:
return printing.pprint_thing(self.terms)
def __len__(self) -> int:
return len(self.expr)
def parse(self):
"""
Parse an expression.
"""
return self._visitor.visit(self.expr)
@property
def names(self):
"""
Get the names in an expression.
"""
if is_term(self.terms):
return frozenset([self.terms.name])
return frozenset(term.name for term in com.flatten(self.terms))
PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}

View File

@ -0,0 +1,286 @@
"""
Expressions
-----------
Offer fast expression evaluation through numexpr
"""
from __future__ import annotations
import operator
from typing import TYPE_CHECKING
import warnings
import numpy as np
from pandas._config import get_option
from pandas.util._exceptions import find_stack_level
from pandas.core import roperator
from pandas.core.computation.check import NUMEXPR_INSTALLED
if NUMEXPR_INSTALLED:
import numexpr as ne
if TYPE_CHECKING:
from pandas._typing import FuncType
_TEST_MODE: bool | None = None
_TEST_RESULT: list[bool] = []
USE_NUMEXPR = NUMEXPR_INSTALLED
_evaluate: FuncType | None = None
_where: FuncType | None = None
# the set of dtypes that we will allow pass to numexpr
_ALLOWED_DTYPES = {
"evaluate": {"int64", "int32", "float64", "float32", "bool"},
"where": {"int64", "float64", "bool"},
}
# the minimum prod shape that we will use numexpr
_MIN_ELEMENTS = 1_000_000
def set_use_numexpr(v: bool = True) -> None:
# set/unset to use numexpr
global USE_NUMEXPR
if NUMEXPR_INSTALLED:
USE_NUMEXPR = v
# choose what we are going to do
global _evaluate, _where
_evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard
_where = _where_numexpr if USE_NUMEXPR else _where_standard
def set_numexpr_threads(n=None) -> None:
# if we are using numexpr, set the threads to n
# otherwise reset
if NUMEXPR_INSTALLED and USE_NUMEXPR:
if n is None:
n = ne.detect_number_of_cores()
ne.set_num_threads(n)
def _evaluate_standard(op, op_str, a, b):
"""
Standard evaluation.
"""
if _TEST_MODE:
_store_test_result(False)
return op(a, b)
def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
"""return a boolean if we WILL be using numexpr"""
if op_str is not None:
# required min elements (otherwise we are adding overhead)
if a.size > _MIN_ELEMENTS:
# check for dtype compatibility
dtypes: set[str] = set()
for o in [a, b]:
# ndarray and Series Case
if hasattr(o, "dtype"):
dtypes |= {o.dtype.name}
# allowed are a superset
if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
return True
return False
def _evaluate_numexpr(op, op_str, a, b):
result = None
if _can_use_numexpr(op, op_str, a, b, "evaluate"):
is_reversed = op.__name__.strip("_").startswith("r")
if is_reversed:
# we were originally called by a reversed op method
a, b = b, a
a_value = a
b_value = b
try:
result = ne.evaluate(
f"a_value {op_str} b_value",
local_dict={"a_value": a_value, "b_value": b_value},
casting="safe",
)
except TypeError:
# numexpr raises eg for array ** array with integers
# (https://github.com/pydata/numexpr/issues/379)
pass
except NotImplementedError:
if _bool_arith_fallback(op_str, a, b):
pass
else:
raise
if is_reversed:
# reverse order to original for fallback
a, b = b, a
if _TEST_MODE:
_store_test_result(result is not None)
if result is None:
result = _evaluate_standard(op, op_str, a, b)
return result
_op_str_mapping = {
operator.add: "+",
roperator.radd: "+",
operator.mul: "*",
roperator.rmul: "*",
operator.sub: "-",
roperator.rsub: "-",
operator.truediv: "/",
roperator.rtruediv: "/",
# floordiv not supported by numexpr 2.x
operator.floordiv: None,
roperator.rfloordiv: None,
# we require Python semantics for mod of negative for backwards compatibility
# see https://github.com/pydata/numexpr/issues/365
# so sticking with unaccelerated for now GH#36552
operator.mod: None,
roperator.rmod: None,
operator.pow: "**",
roperator.rpow: "**",
operator.eq: "==",
operator.ne: "!=",
operator.le: "<=",
operator.lt: "<",
operator.ge: ">=",
operator.gt: ">",
operator.and_: "&",
roperator.rand_: "&",
operator.or_: "|",
roperator.ror_: "|",
operator.xor: "^",
roperator.rxor: "^",
divmod: None,
roperator.rdivmod: None,
}
def _where_standard(cond, a, b):
# Caller is responsible for extracting ndarray if necessary
return np.where(cond, a, b)
def _where_numexpr(cond, a, b):
# Caller is responsible for extracting ndarray if necessary
result = None
if _can_use_numexpr(None, "where", a, b, "where"):
result = ne.evaluate(
"where(cond_value, a_value, b_value)",
local_dict={"cond_value": cond, "a_value": a, "b_value": b},
casting="safe",
)
if result is None:
result = _where_standard(cond, a, b)
return result
# turn myself on
set_use_numexpr(get_option("compute.use_numexpr"))
def _has_bool_dtype(x):
try:
return x.dtype == bool
except AttributeError:
return isinstance(x, (bool, np.bool_))
_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}
def _bool_arith_fallback(op_str, a, b) -> bool:
"""
Check if we should fallback to the python `_evaluate_standard` in case
of an unsupported operation by numexpr, which is the case for some
boolean ops.
"""
if _has_bool_dtype(a) and _has_bool_dtype(b):
if op_str in _BOOL_OP_UNSUPPORTED:
warnings.warn(
f"evaluating in Python space because the {repr(op_str)} "
"operator is not supported by numexpr for the bool dtype, "
f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.",
stacklevel=find_stack_level(),
)
return True
return False
def evaluate(op, a, b, use_numexpr: bool = True):
"""
Evaluate and return the expression of the op on a and b.
Parameters
----------
op : the actual operand
a : left operand
b : right operand
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
op_str = _op_str_mapping[op]
if op_str is not None:
if use_numexpr:
# error: "None" not callable
return _evaluate(op, op_str, a, b) # type: ignore[misc]
return _evaluate_standard(op, op_str, a, b)
def where(cond, a, b, use_numexpr: bool = True):
"""
Evaluate the where condition cond on a and b.
Parameters
----------
cond : np.ndarray[bool]
a : return if cond is True
b : return if cond is False
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
assert _where is not None
return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
def set_test_mode(v: bool = True) -> None:
"""
Keeps track of whether numexpr was used.
Stores an additional ``True`` for every successful use of evaluate with
numexpr since the last ``get_test_result``.
"""
global _TEST_MODE, _TEST_RESULT
_TEST_MODE = v
_TEST_RESULT = []
def _store_test_result(used_numexpr: bool) -> None:
if used_numexpr:
_TEST_RESULT.append(used_numexpr)
def get_test_result() -> list[bool]:
"""
Get test result and reset test_results.
"""
global _TEST_RESULT
res = _TEST_RESULT
_TEST_RESULT = []
return res

View File

@ -0,0 +1,572 @@
"""
Operator classes for eval.
"""
from __future__ import annotations
from datetime import datetime
from functools import partial
import operator
from typing import (
TYPE_CHECKING,
Callable,
Literal,
)
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.core.dtypes.common import (
is_list_like,
is_scalar,
)
import pandas.core.common as com
from pandas.core.computation.common import (
ensure_decoded,
result_type_many,
)
from pandas.core.computation.scope import DEFAULT_GLOBALS
from pandas.io.formats.printing import (
pprint_thing,
pprint_thing_encoded,
)
if TYPE_CHECKING:
from collections.abc import (
Iterable,
Iterator,
)
REDUCTIONS = ("sum", "prod", "min", "max")
_unary_math_ops = (
"sin",
"cos",
"exp",
"log",
"expm1",
"log1p",
"sqrt",
"sinh",
"cosh",
"tanh",
"arcsin",
"arccos",
"arctan",
"arccosh",
"arcsinh",
"arctanh",
"abs",
"log10",
"floor",
"ceil",
)
_binary_math_ops = ("arctan2",)
MATHOPS = _unary_math_ops + _binary_math_ops
LOCAL_TAG = "__pd_eval_local_"
class Term:
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
# error: Argument 2 for "super" not an instance of argument 1
supr_new = super(Term, klass).__new__ # type: ignore[misc]
return supr_new(klass)
is_local: bool
def __init__(self, name, env, side=None, encoding=None) -> None:
# name is a str for Term, but may be something else for subclasses
self._name = name
self.env = env
self.side = side
tname = str(name)
self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS
self._value = self._resolve_name()
self.encoding = encoding
@property
def local_name(self) -> str:
return self.name.replace(LOCAL_TAG, "")
def __repr__(self) -> str:
return pprint_thing(self.name)
def __call__(self, *args, **kwargs):
return self.value
def evaluate(self, *args, **kwargs) -> Term:
return self
def _resolve_name(self):
local_name = str(self.local_name)
is_local = self.is_local
if local_name in self.env.scope and isinstance(
self.env.scope[local_name], type
):
is_local = False
res = self.env.resolve(local_name, is_local=is_local)
self.update(res)
if hasattr(res, "ndim") and res.ndim > 2:
raise NotImplementedError(
"N-dimensional objects, where N > 2, are not supported with eval"
)
return res
def update(self, value) -> None:
"""
search order for local (i.e., @variable) variables:
scope, key_variable
[('locals', 'local_name'),
('globals', 'local_name'),
('locals', 'key'),
('globals', 'key')]
"""
key = self.name
# if it's a variable name (otherwise a constant)
if isinstance(key, str):
self.env.swapkey(self.local_name, key, new_value=value)
self.value = value
@property
def is_scalar(self) -> bool:
return is_scalar(self._value)
@property
def type(self):
try:
# potentially very slow for large, mixed dtype frames
return self._value.values.dtype
except AttributeError:
try:
# ndarray
return self._value.dtype
except AttributeError:
# scalar
return type(self._value)
return_type = type
@property
def raw(self) -> str:
return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})"
@property
def is_datetime(self) -> bool:
try:
t = self.type.type
except AttributeError:
t = self.type
return issubclass(t, (datetime, np.datetime64))
@property
def value(self):
return self._value
@value.setter
def value(self, new_value) -> None:
self._value = new_value
@property
def name(self):
return self._name
@property
def ndim(self) -> int:
return self._value.ndim
class Constant(Term):
def _resolve_name(self):
return self._name
@property
def name(self):
return self.value
def __repr__(self) -> str:
# in python 2 str() of float
# can truncate shorter than repr()
return repr(self.name)
_bool_op_map = {"not": "~", "and": "&", "or": "|"}
class Op:
"""
Hold an operator of arbitrary arity.
"""
op: str
def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None:
self.op = _bool_op_map.get(op, op)
self.operands = operands
self.encoding = encoding
def __iter__(self) -> Iterator:
return iter(self.operands)
def __repr__(self) -> str:
"""
Print a generic n-ary operator and its operands using infix notation.
"""
# recurse over the operands
parened = (f"({pprint_thing(opr)})" for opr in self.operands)
return pprint_thing(f" {self.op} ".join(parened))
@property
def return_type(self):
# clobber types to bool if the op is a boolean operator
if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
return np.bool_
return result_type_many(*(term.type for term in com.flatten(self)))
@property
def has_invalid_return_type(self) -> bool:
types = self.operand_types
obj_dtype_set = frozenset([np.dtype("object")])
return self.return_type == object and types - obj_dtype_set
@property
def operand_types(self):
return frozenset(term.type for term in com.flatten(self))
@property
def is_scalar(self) -> bool:
return all(operand.is_scalar for operand in self.operands)
@property
def is_datetime(self) -> bool:
try:
t = self.return_type.type
except AttributeError:
t = self.return_type
return issubclass(t, (datetime, np.datetime64))
def _in(x, y):
"""
Compute the vectorized membership of ``x in y`` if possible, otherwise
use Python.
"""
try:
return x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return y.isin(x)
except AttributeError:
pass
return x in y
def _not_in(x, y):
"""
Compute the vectorized membership of ``x not in y`` if possible,
otherwise use Python.
"""
try:
return ~x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return ~y.isin(x)
except AttributeError:
pass
return x not in y
CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in")
_cmp_ops_funcs = (
operator.gt,
operator.lt,
operator.ge,
operator.le,
operator.eq,
operator.ne,
_in,
_not_in,
)
_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs))
BOOL_OPS_SYMS = ("&", "|", "and", "or")
_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_)
_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs))
ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%")
_arith_ops_funcs = (
operator.add,
operator.sub,
operator.mul,
operator.truediv,
operator.pow,
operator.floordiv,
operator.mod,
)
_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs))
SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%")
_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod)
_special_case_arith_ops_dict = dict(
zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs)
)
_binary_ops_dict = {}
for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
_binary_ops_dict.update(d)
def is_term(obj) -> bool:
return isinstance(obj, Term)
class BinOp(Op):
"""
Hold a binary operator and its operands.
Parameters
----------
op : str
lhs : Term or Op
rhs : Term or Op
"""
def __init__(self, op: str, lhs, rhs) -> None:
super().__init__(op, (lhs, rhs))
self.lhs = lhs
self.rhs = rhs
self._disallow_scalar_only_bool_ops()
self.convert_values()
try:
self.func = _binary_ops_dict[op]
except KeyError as err:
# has to be made a list for python3
keys = list(_binary_ops_dict.keys())
raise ValueError(
f"Invalid binary operator {repr(op)}, valid operators are {keys}"
) from err
def __call__(self, env):
"""
Recursively evaluate an expression in Python space.
Parameters
----------
env : Scope
Returns
-------
object
The result of an evaluated expression.
"""
# recurse over the left/right nodes
left = self.lhs(env)
right = self.rhs(env)
return self.func(left, right)
def evaluate(self, env, engine: str, parser, term_type, eval_in_python):
"""
Evaluate a binary operation *before* being passed to the engine.
Parameters
----------
env : Scope
engine : str
parser : str
term_type : type
eval_in_python : list
Returns
-------
term_type
The "pre-evaluated" expression as an instance of ``term_type``
"""
if engine == "python":
res = self(env)
else:
# recurse over the left/right nodes
left = self.lhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
right = self.rhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
# base cases
if self.op in eval_in_python:
res = self.func(left.value, right.value)
else:
from pandas.core.computation.eval import eval
res = eval(self, local_dict=env, engine=engine, parser=parser)
name = env.add_tmp(res)
return term_type(name, env=env)
def convert_values(self) -> None:
"""
Convert datetimes to a comparable value in an expression.
"""
def stringify(value):
encoder: Callable
if self.encoding is not None:
encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
lhs, rhs = self.lhs, self.rhs
if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
v = rhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.rhs.update(v)
if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
v = lhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.lhs.update(v)
def _disallow_scalar_only_bool_ops(self):
rhs = self.rhs
lhs = self.lhs
# GH#24883 unwrap dtype if necessary to ensure we have a type object
rhs_rt = rhs.return_type
rhs_rt = getattr(rhs_rt, "type", rhs_rt)
lhs_rt = lhs.return_type
lhs_rt = getattr(lhs_rt, "type", lhs_rt)
if (
(lhs.is_scalar or rhs.is_scalar)
and self.op in _bool_ops_dict
and (
not (
issubclass(rhs_rt, (bool, np.bool_))
and issubclass(lhs_rt, (bool, np.bool_))
)
)
):
raise NotImplementedError("cannot evaluate scalar only bool ops")
def isnumeric(dtype) -> bool:
return issubclass(np.dtype(dtype).type, np.number)
UNARY_OPS_SYMS = ("+", "-", "~", "not")
_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert)
_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs))
class UnaryOp(Op):
"""
Hold a unary operator and its operands.
Parameters
----------
op : str
The token used to represent the operator.
operand : Term or Op
The Term or Op operand to the operator.
Raises
------
ValueError
* If no function associated with the passed operator token is found.
"""
def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None:
super().__init__(op, (operand,))
self.operand = operand
try:
self.func = _unary_ops_dict[op]
except KeyError as err:
raise ValueError(
f"Invalid unary operator {repr(op)}, "
f"valid operators are {UNARY_OPS_SYMS}"
) from err
def __call__(self, env) -> MathCall:
operand = self.operand(env)
# error: Cannot call function of unknown type
return self.func(operand) # type: ignore[operator]
def __repr__(self) -> str:
return pprint_thing(f"{self.op}({self.operand})")
@property
def return_type(self) -> np.dtype:
operand = self.operand
if operand.return_type == np.dtype("bool"):
return np.dtype("bool")
if isinstance(operand, Op) and (
operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
):
return np.dtype("bool")
return np.dtype("int")
class MathCall(Op):
def __init__(self, func, args) -> None:
super().__init__(func.name, args)
self.func = func
def __call__(self, env):
# error: "Op" not callable
operands = [op(env) for op in self.operands] # type: ignore[operator]
return self.func.func(*operands)
def __repr__(self) -> str:
operands = map(str, self.operands)
return pprint_thing(f"{self.op}({','.join(operands)})")
class FuncNode:
def __init__(self, name: str) -> None:
if name not in MATHOPS:
raise ValueError(f'"{name}" is not a supported function')
self.name = name
self.func = getattr(np, name)
def __call__(self, *args) -> MathCall:
return MathCall(self, args)

View File

@ -0,0 +1,198 @@
"""
:func:`~pandas.eval` source string parsing functions
"""
from __future__ import annotations
from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
)
# A token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100
def create_valid_python_identifier(name: str) -> str:
"""
Create valid Python identifiers from any string.
Check if name contains any special characters. If it contains any
special characters, the special characters will be replaced by
a special string and a prefix is added.
Raises
------
SyntaxError
If the returned name is not a Python valid identifier, raise an exception.
This can happen if there is a hashtag in the name, as the tokenizer will
than terminate and not find the backtick.
But also for characters that fall out of the range of (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name
# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
special_characters_replacements = {
char: f"_{token.tok_name[tokval]}_"
for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items())
}
special_characters_replacements.update(
{
" ": "_",
"?": "_QUESTIONMARK_",
"!": "_EXCLAMATIONMARK_",
"$": "_DOLLARSIGN_",
"": "_EUROSIGN_",
"°": "_DEGREESIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
}
)
name = "".join([special_characters_replacements.get(char, char) for char in name])
name = f"BACKTICK_QUOTED_STRING_{name}"
if not name.isidentifier():
raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
return name
def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
"""
Clean up a column name if surrounded by backticks.
Backtick quoted string are indicated by a certain tokval value. If a string
is a backtick quoted token it will processed by
:func:`_create_valid_python_identifier` so that the parser can find this
string when the query is executed.
In this case the tok will get the NAME tokval.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tok : Tuple[int, str]
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == BACKTICK_QUOTED_STRING:
return tokenize.NAME, create_valid_python_identifier(tokval)
return toknum, tokval
def clean_column_name(name: Hashable) -> Hashable:
"""
Function to emulate the cleaning of a backtick quoted name.
The purpose for this function is to see what happens to the name of
identifier if it goes to the process of being parsed a Python code
inside a backtick quoted string and than being cleaned
(removed of any special characters).
Parameters
----------
name : hashable
Name to be cleaned.
Returns
-------
name : hashable
Returns the name after tokenizing and cleaning.
Notes
-----
For some cases, a name cannot be converted to a valid Python identifier.
In that case :func:`tokenize_string` raises a SyntaxError.
In that case, we just return the name unmodified.
If this name was used in the query string (this makes the query call impossible)
an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
which is not caught and propagates to the user level.
"""
try:
tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
except SyntaxError:
return name
def tokenize_backtick_quoted_string(
token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
"""
Creates a token from a backtick quoted string.
Moves the token_generator forwards till right after the next backtick.
Parameters
----------
token_generator : Iterator[tokenize.TokenInfo]
The generator that yields the tokens of the source string (Tuple[int, str]).
The generator is at the first token after the backtick (`)
source : str
The Python source code string.
string_start : int
This is the start of backtick quoted string inside the source string.
Returns
-------
tok: Tuple[int, str]
The token that represents the backtick quoted string.
The integer is equal to BACKTICK_QUOTED_STRING (100).
"""
for _, tokval, start, _, _ in token_generator:
if tokval == "`":
string_end = start[1]
break
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
"""
Tokenize a Python source code string.
Parameters
----------
source : str
The Python source code string.
Returns
-------
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]).
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)
# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted string
for toknum, tokval, start, _, _ in token_generator:
if tokval == "`":
try:
yield tokenize_backtick_quoted_string(
token_generator, source, string_start=start[1] + 1
)
except Exception as err:
raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
else:
yield toknum, tokval

View File

@ -0,0 +1,666 @@
""" manage PyTables query interface via Expressions """
from __future__ import annotations
import ast
from decimal import (
Decimal,
InvalidOperation,
)
from functools import partial
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
)
import numpy as np
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
)
from pandas.errors import UndefinedVariableError
from pandas.core.dtypes.common import is_list_like
import pandas.core.common as com
from pandas.core.computation import (
expr,
ops,
scope as _scope,
)
from pandas.core.computation.common import ensure_decoded
from pandas.core.computation.expr import BaseExprVisitor
from pandas.core.computation.ops import is_term
from pandas.core.construction import extract_array
from pandas.core.indexes.base import Index
from pandas.io.formats.printing import (
pprint_thing,
pprint_thing_encoded,
)
if TYPE_CHECKING:
from pandas._typing import (
Self,
npt,
)
class PyTablesScope(_scope.Scope):
__slots__ = ("queryables",)
queryables: dict[str, Any]
def __init__(
self,
level: int,
global_dict=None,
local_dict=None,
queryables: dict[str, Any] | None = None,
) -> None:
super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
self.queryables = queryables or {}
class Term(ops.Term):
env: PyTablesScope
def __new__(cls, name, env, side=None, encoding=None):
if isinstance(name, str):
klass = cls
else:
klass = Constant
return object.__new__(klass)
def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
super().__init__(name, env, side=side, encoding=encoding)
def _resolve_name(self):
# must be a queryables
if self.side == "left":
# Note: The behavior of __new__ ensures that self.name is a str here
if self.name not in self.env.queryables:
raise NameError(f"name {repr(self.name)} is not defined")
return self.name
# resolve the rhs (and allow it to be None)
try:
return self.env.resolve(self.name, is_local=False)
except UndefinedVariableError:
return self.name
# read-only property overwriting read/write property
@property # type: ignore[misc]
def value(self):
return self._value
class Constant(Term):
def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
assert isinstance(env, PyTablesScope), type(env)
super().__init__(name, env, side=side, encoding=encoding)
def _resolve_name(self):
return self._name
class BinOp(ops.BinOp):
_max_selectors = 31
op: str
queryables: dict[str, Any]
condition: str | None
def __init__(self, op: str, lhs, rhs, queryables: dict[str, Any], encoding) -> None:
super().__init__(op, lhs, rhs)
self.queryables = queryables
self.encoding = encoding
self.condition = None
def _disallow_scalar_only_bool_ops(self) -> None:
pass
def prune(self, klass):
def pr(left, right):
"""create and return a new specialized BinOp from myself"""
if left is None:
return right
elif right is None:
return left
k = klass
if isinstance(left, ConditionBinOp):
if isinstance(right, ConditionBinOp):
k = JointConditionBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
elif isinstance(left, FilterBinOp):
if isinstance(right, FilterBinOp):
k = JointFilterBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
return k(
self.op, left, right, queryables=self.queryables, encoding=self.encoding
).evaluate()
left, right = self.lhs, self.rhs
if is_term(left) and is_term(right):
res = pr(left.value, right.value)
elif not is_term(left) and is_term(right):
res = pr(left.prune(klass), right.value)
elif is_term(left) and not is_term(right):
res = pr(left.value, right.prune(klass))
elif not (is_term(left) or is_term(right)):
res = pr(left.prune(klass), right.prune(klass))
return res
def conform(self, rhs):
"""inplace conform rhs"""
if not is_list_like(rhs):
rhs = [rhs]
if isinstance(rhs, np.ndarray):
rhs = rhs.ravel()
return rhs
@property
def is_valid(self) -> bool:
"""return True if this is a valid field"""
return self.lhs in self.queryables
@property
def is_in_table(self) -> bool:
"""
return True if this is a valid column name for generation (e.g. an
actual column in the table)
"""
return self.queryables.get(self.lhs) is not None
@property
def kind(self):
"""the kind of my field"""
return getattr(self.queryables.get(self.lhs), "kind", None)
@property
def meta(self):
"""the meta of my field"""
return getattr(self.queryables.get(self.lhs), "meta", None)
@property
def metadata(self):
"""the metadata of my field"""
return getattr(self.queryables.get(self.lhs), "metadata", None)
def generate(self, v) -> str:
"""create and return the op string for this TermValue"""
val = v.tostring(self.encoding)
return f"({self.lhs} {self.op} {val})"
def convert_value(self, v) -> TermValue:
"""
convert the expression that is in the term to something that is
accepted by pytables
"""
def stringify(value):
if self.encoding is not None:
return pprint_thing_encoded(value, encoding=self.encoding)
return pprint_thing(value)
kind = ensure_decoded(self.kind)
meta = ensure_decoded(self.meta)
if kind == "datetime" or (kind and kind.startswith("datetime64")):
if isinstance(v, (int, float)):
v = stringify(v)
v = ensure_decoded(v)
v = Timestamp(v).as_unit("ns")
if v.tz is not None:
v = v.tz_convert("UTC")
return TermValue(v, v._value, kind)
elif kind in ("timedelta64", "timedelta"):
if isinstance(v, str):
v = Timedelta(v)
else:
v = Timedelta(v, unit="s")
v = v.as_unit("ns")._value
return TermValue(int(v), v, kind)
elif meta == "category":
metadata = extract_array(self.metadata, extract_numpy=True)
result: npt.NDArray[np.intp] | np.intp | int
if v not in metadata:
result = -1
else:
result = metadata.searchsorted(v, side="left")
return TermValue(result, result, "integer")
elif kind == "integer":
try:
v_dec = Decimal(v)
except InvalidOperation:
# GH 54186
# convert v to float to raise float's ValueError
float(v)
else:
v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
return TermValue(v, v, kind)
elif kind == "float":
v = float(v)
return TermValue(v, v, kind)
elif kind == "bool":
if isinstance(v, str):
v = v.strip().lower() not in [
"false",
"f",
"no",
"n",
"none",
"0",
"[]",
"{}",
"",
]
else:
v = bool(v)
return TermValue(v, v, kind)
elif isinstance(v, str):
# string quoting
return TermValue(v, stringify(v), "string")
else:
raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
def convert_values(self) -> None:
pass
class FilterBinOp(BinOp):
filter: tuple[Any, Any, Index] | None = None
def __repr__(self) -> str:
if self.filter is None:
return "Filter: Not Initialized"
return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]")
def invert(self) -> Self:
"""invert the filter"""
if self.filter is not None:
self.filter = (
self.filter[0],
self.generate_filter_op(invert=True),
self.filter[2],
)
return self
def format(self):
"""return the actual filter format"""
return [self.filter]
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self | None: # type: ignore[override]
if not self.is_valid:
raise ValueError(f"query term is not valid [{self}]")
rhs = self.conform(self.rhs)
values = list(rhs)
if self.is_in_table:
# if too many values to create the expression, use a filter instead
if self.op in ["==", "!="] and len(values) > self._max_selectors:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, Index(values))
return self
return None
# equality conditions
if self.op in ["==", "!="]:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, Index(values))
else:
raise TypeError(
f"passing a filterable condition to a non-table indexer [{self}]"
)
return self
def generate_filter_op(self, invert: bool = False):
if (self.op == "!=" and not invert) or (self.op == "==" and invert):
return lambda axis, vals: ~axis.isin(vals)
else:
return lambda axis, vals: axis.isin(vals)
class JointFilterBinOp(FilterBinOp):
def format(self):
raise NotImplementedError("unable to collapse Joint Filters")
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self: # type: ignore[override]
return self
class ConditionBinOp(BinOp):
def __repr__(self) -> str:
return pprint_thing(f"[Condition : [{self.condition}]]")
def invert(self):
"""invert the condition"""
# if self.condition is not None:
# self.condition = "~(%s)" % self.condition
# return self
raise NotImplementedError(
"cannot use an invert condition when passing to numexpr"
)
def format(self):
"""return the actual ne format"""
return self.condition
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self | None: # type: ignore[override]
if not self.is_valid:
raise ValueError(f"query term is not valid [{self}]")
# convert values if we are in the table
if not self.is_in_table:
return None
rhs = self.conform(self.rhs)
values = [self.convert_value(v) for v in rhs]
# equality conditions
if self.op in ["==", "!="]:
# too many values to create the expression?
if len(values) <= self._max_selectors:
vs = [self.generate(v) for v in values]
self.condition = f"({' | '.join(vs)})"
# use a filter after reading
else:
return None
else:
self.condition = self.generate(values[0])
return self
class JointConditionBinOp(ConditionBinOp):
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self: # type: ignore[override]
self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})"
return self
class UnaryOp(ops.UnaryOp):
def prune(self, klass):
if self.op != "~":
raise NotImplementedError("UnaryOp only support invert type ops")
operand = self.operand
operand = operand.prune(klass)
if operand is not None and (
issubclass(klass, ConditionBinOp)
and operand.condition is not None
or not issubclass(klass, ConditionBinOp)
and issubclass(klass, FilterBinOp)
and operand.filter is not None
):
return operand.invert()
return None
class PyTablesExprVisitor(BaseExprVisitor):
const_type: ClassVar[type[ops.Term]] = Constant
term_type: ClassVar[type[Term]] = Term
def __init__(self, env, engine, parser, **kwargs) -> None:
super().__init__(env, engine, parser)
for bin_op in self.binary_ops:
bin_node = self.binary_op_nodes_map[bin_op]
setattr(
self,
f"visit_{bin_node}",
lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
)
def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None:
if isinstance(node.op, (ast.Not, ast.Invert)):
return UnaryOp("~", self.visit(node.operand))
elif isinstance(node.op, ast.USub):
return self.const_type(-self.visit(node.operand).value, self.env)
elif isinstance(node.op, ast.UAdd):
raise NotImplementedError("Unary addition not supported")
# TODO: return None might never be reached
return None
def visit_Index(self, node, **kwargs):
return self.visit(node.value).value
def visit_Assign(self, node, **kwargs):
cmpr = ast.Compare(
ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
)
return self.visit(cmpr)
def visit_Subscript(self, node, **kwargs) -> ops.Term:
# only allow simple subscripts
value = self.visit(node.value)
slobj = self.visit(node.slice)
try:
value = value.value
except AttributeError:
pass
if isinstance(slobj, Term):
# In py39 np.ndarray lookups with Term containing int raise
slobj = slobj.value
try:
return self.const_type(value[slobj], self.env)
except TypeError as err:
raise ValueError(
f"cannot subscript {repr(value)} with {repr(slobj)}"
) from err
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = type(node.ctx)
if ctx == ast.Load:
# resolve the value
resolved = self.visit(value)
# try to get the value to see if we are another expression
try:
resolved = resolved.value
except AttributeError:
pass
try:
return self.term_type(getattr(resolved, attr), self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise ValueError(f"Invalid Attribute context {ctx.__name__}")
def translate_In(self, op):
return ast.Eq() if isinstance(op, ast.In) else op
def _rewrite_membership_op(self, node, left, right):
return self.visit(node.op), node.op, left, right
def _validate_where(w):
"""
Validate that the where statement is of the right type.
The type may either be String, Expr, or list-like of Exprs.
Parameters
----------
w : String term expression, Expr, or list-like of Exprs.
Returns
-------
where : The original where clause if the check was successful.
Raises
------
TypeError : An invalid data type was passed in for w (e.g. dict).
"""
if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)):
raise TypeError(
"where must be passed as a string, PyTablesExpr, "
"or list-like of PyTablesExpr"
)
return w
class PyTablesExpr(expr.Expr):
"""
Hold a pytables-like expression, comprised of possibly multiple 'terms'.
Parameters
----------
where : string term expression, PyTablesExpr, or list-like of PyTablesExprs
queryables : a "kinds" map (dict of column name -> kind), or None if column
is non-indexable
encoding : an encoding that will encode the query terms
Returns
-------
a PyTablesExpr object
Examples
--------
'index>=date'
"columns=['A', 'D']"
'columns=A'
'columns==A'
"~(columns=['A','B'])"
'index>df.index[3] & string="bar"'
'(index>df.index[3] & index<=df.index[6]) | string="bar"'
"ts>=Timestamp('2012-02-01')"
"major_axis>=20130101"
"""
_visitor: PyTablesExprVisitor | None
env: PyTablesScope
expr: str
def __init__(
self,
where,
queryables: dict[str, Any] | None = None,
encoding=None,
scope_level: int = 0,
) -> None:
where = _validate_where(where)
self.encoding = encoding
self.condition = None
self.filter = None
self.terms = None
self._visitor = None
# capture the environment if needed
local_dict: _scope.DeepChainMap[Any, Any] | None = None
if isinstance(where, PyTablesExpr):
local_dict = where.env.scope
_where = where.expr
elif is_list_like(where):
where = list(where)
for idx, w in enumerate(where):
if isinstance(w, PyTablesExpr):
local_dict = w.env.scope
else:
where[idx] = _validate_where(w)
_where = " & ".join([f"({w})" for w in com.flatten(where)])
else:
# _validate_where ensures we otherwise have a string
_where = where
self.expr = _where
self.env = PyTablesScope(scope_level + 1, local_dict=local_dict)
if queryables is not None and isinstance(self.expr, str):
self.env.queryables.update(queryables)
self._visitor = PyTablesExprVisitor(
self.env,
queryables=queryables,
parser="pytables",
engine="pytables",
encoding=encoding,
)
self.terms = self.parse()
def __repr__(self) -> str:
if self.terms is not None:
return pprint_thing(self.terms)
return pprint_thing(self.expr)
def evaluate(self):
"""create and return the numexpr condition and filter"""
try:
self.condition = self.terms.prune(ConditionBinOp)
except AttributeError as err:
raise ValueError(
f"cannot process expression [{self.expr}], [{self}] "
"is not a valid condition"
) from err
try:
self.filter = self.terms.prune(FilterBinOp)
except AttributeError as err:
raise ValueError(
f"cannot process expression [{self.expr}], [{self}] "
"is not a valid filter"
) from err
return self.condition, self.filter
class TermValue:
"""hold a term value the we use to construct a condition/filter"""
def __init__(self, value, converted, kind: str) -> None:
assert isinstance(kind, str), kind
self.value = value
self.converted = converted
self.kind = kind
def tostring(self, encoding) -> str:
"""quote the string if not encoded else encode and return"""
if self.kind == "string":
if encoding is not None:
return str(self.converted)
return f'"{self.converted}"'
elif self.kind == "float":
# python 2 str(float) is not always
# round-trippable so use repr()
return repr(self.converted)
return str(self.converted)
def maybe_expression(s) -> bool:
"""loose checking if s is a pytables-acceptable expression"""
if not isinstance(s, str):
return False
operations = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",)
# make sure we have an op at least
return any(op in s for op in operations)

View File

@ -0,0 +1,355 @@
"""
Module for scope operations
"""
from __future__ import annotations
from collections import ChainMap
import datetime
import inspect
from io import StringIO
import itertools
import pprint
import struct
import sys
from typing import TypeVar
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.errors import UndefinedVariableError
_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes
class DeepChainMap(ChainMap[_KT, _VT]):
"""
Variant of ChainMap that allows direct updates to inner scopes.
Only works when all passed mapping are mutable.
"""
def __setitem__(self, key: _KT, value: _VT) -> None:
for mapping in self.maps:
if key in mapping:
mapping[key] = value
return
self.maps[0][key] = value
def __delitem__(self, key: _KT) -> None:
"""
Raises
------
KeyError
If `key` doesn't exist.
"""
for mapping in self.maps:
if key in mapping:
del mapping[key]
return
raise KeyError(key)
def ensure_scope(
level: int, global_dict=None, local_dict=None, resolvers=(), target=None
) -> Scope:
"""Ensure that we are grabbing the correct scope."""
return Scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
def _replacer(x) -> str:
"""
Replace a number with its hexadecimal representation. Used to tag
temporary variables with their calling scope's id.
"""
# get the hex repr of the binary char and remove 0x and pad by pad_size
# zeros
try:
hexin = ord(x)
except TypeError:
# bytes literals masquerade as ints when iterating in py3
hexin = x
return hex(hexin)
def _raw_hex_id(obj) -> str:
"""Return the padded hexadecimal id of ``obj``."""
# interpret as a pointer since that's what really what id returns
packed = struct.pack("@P", id(obj))
return "".join([_replacer(x) for x in packed])
DEFAULT_GLOBALS = {
"Timestamp": Timestamp,
"datetime": datetime.datetime,
"True": True,
"False": False,
"list": list,
"tuple": tuple,
"inf": np.inf,
"Inf": np.inf,
}
def _get_pretty_string(obj) -> str:
"""
Return a prettier version of obj.
Parameters
----------
obj : object
Object to pretty print
Returns
-------
str
Pretty print object repr
"""
sio = StringIO()
pprint.pprint(obj, stream=sio)
return sio.getvalue()
class Scope:
"""
Object to hold scope, with a few bells to deal with some custom syntax
and contexts added by pandas.
Parameters
----------
level : int
global_dict : dict or None, optional, default None
local_dict : dict or Scope or None, optional, default None
resolvers : list-like or None, optional, default None
target : object
Attributes
----------
level : int
scope : DeepChainMap
target : object
temps : dict
"""
__slots__ = ["level", "scope", "target", "resolvers", "temps"]
level: int
scope: DeepChainMap
resolvers: DeepChainMap
temps: dict
def __init__(
self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
) -> None:
self.level = level + 1
# shallow copy because we don't want to keep filling this up with what
# was there before if there are multiple calls to Scope/_ensure_scope
self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
self.target = target
if isinstance(local_dict, Scope):
self.scope.update(local_dict.scope)
if local_dict.target is not None:
self.target = local_dict.target
self._update(local_dict.level)
frame = sys._getframe(self.level)
try:
# shallow copy here because we don't want to replace what's in
# scope when we align terms (alignment accesses the underlying
# numpy array of pandas objects)
scope_global = self.scope.new_child(
(global_dict if global_dict is not None else frame.f_globals).copy()
)
self.scope = DeepChainMap(scope_global)
if not isinstance(local_dict, Scope):
scope_local = self.scope.new_child(
(local_dict if local_dict is not None else frame.f_locals).copy()
)
self.scope = DeepChainMap(scope_local)
finally:
del frame
# assumes that resolvers are going from outermost scope to inner
if isinstance(local_dict, Scope):
resolvers += tuple(local_dict.resolvers.maps)
self.resolvers = DeepChainMap(*resolvers)
self.temps = {}
def __repr__(self) -> str:
scope_keys = _get_pretty_string(list(self.scope.keys()))
res_keys = _get_pretty_string(list(self.resolvers.keys()))
return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"
@property
def has_resolvers(self) -> bool:
"""
Return whether we have any extra scope.
For example, DataFrames pass Their columns as resolvers during calls to
``DataFrame.eval()`` and ``DataFrame.query()``.
Returns
-------
hr : bool
"""
return bool(len(self.resolvers))
def resolve(self, key: str, is_local: bool):
"""
Resolve a variable name in a possibly local context.
Parameters
----------
key : str
A variable name
is_local : bool
Flag indicating whether the variable is local or not (prefixed with
the '@' symbol)
Returns
-------
value : object
The value of a particular variable
"""
try:
# only look for locals in outer scope
if is_local:
return self.scope[key]
# not a local variable so check in resolvers if we have them
if self.has_resolvers:
return self.resolvers[key]
# if we're here that means that we have no locals and we also have
# no resolvers
assert not is_local and not self.has_resolvers
return self.scope[key]
except KeyError:
try:
# last ditch effort we look in temporaries
# these are created when parsing indexing expressions
# e.g., df[df > 0]
return self.temps[key]
except KeyError as err:
raise UndefinedVariableError(key, is_local) from err
def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
"""
Replace a variable name, with a potentially new value.
Parameters
----------
old_key : str
Current variable name to replace
new_key : str
New variable name to replace `old_key` with
new_value : object
Value to be replaced along with the possible renaming
"""
if self.has_resolvers:
maps = self.resolvers.maps + self.scope.maps
else:
maps = self.scope.maps
maps.append(self.temps)
for mapping in maps:
if old_key in mapping:
mapping[new_key] = new_value
return
def _get_vars(self, stack, scopes: list[str]) -> None:
"""
Get specifically scoped variables from a list of stack frames.
Parameters
----------
stack : list
A list of stack frames as returned by ``inspect.stack()``
scopes : sequence of strings
A sequence containing valid stack frame attribute names that
evaluate to a dictionary. For example, ('locals', 'globals')
"""
variables = itertools.product(scopes, stack)
for scope, (frame, _, _, _, _, _) in variables:
try:
d = getattr(frame, f"f_{scope}")
self.scope = DeepChainMap(self.scope.new_child(d))
finally:
# won't remove it, but DECREF it
# in Py3 this probably isn't necessary since frame won't be
# scope after the loop
del frame
def _update(self, level: int) -> None:
"""
Update the current scope by going back `level` levels.
Parameters
----------
level : int
"""
sl = level + 1
# add sl frames to the scope starting with the
# most distant and overwriting with more current
# makes sure that we can capture variable scope
stack = inspect.stack()
try:
self._get_vars(stack[:sl], scopes=["locals"])
finally:
del stack[:], stack
def add_tmp(self, value) -> str:
"""
Add a temporary variable to the scope.
Parameters
----------
value : object
An arbitrary object to be assigned to a temporary variable.
Returns
-------
str
The name of the temporary variable created.
"""
name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}"
# add to inner most scope
assert name not in self.temps
self.temps[name] = value
assert name in self.temps
# only increment if the variable gets put in the scope
return name
@property
def ntemps(self) -> int:
"""The number of temporary variables in this scope"""
return len(self.temps)
@property
def full_scope(self) -> DeepChainMap:
"""
Return the full scope for use with passing to engines transparently
as a mapping.
Returns
-------
vars : DeepChainMap
All variables in this scope.
"""
maps = [self.temps] + self.resolvers.maps + self.scope.maps
return DeepChainMap(*maps)