Updated script that can be controled by Nodejs web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,111 @@
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.reshape.merge import (
MergeError,
merge,
)
@pytest.mark.parametrize(
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({input_col: [3, 4]})
left_copy = left.copy()
right_copy = right.copy()
result = merge(left, right, how="cross")
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(left, left_copy)
tm.assert_frame_equal(right, right_copy)
@pytest.mark.parametrize(
"kwargs",
[
{"left_index": True},
{"right_index": True},
{"on": "a"},
{"left_on": "a"},
{"right_on": "b"},
],
)
def test_merge_cross_error_reporting(kwargs):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"b": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
merge(left, right, how="cross", **kwargs)
def test_merge_cross_mixed_dtypes():
# GH#5401
left = DataFrame(["a", "b", "c"], columns=["A"])
right = DataFrame(range(2), columns=["B"])
result = merge(left, right, how="cross")
expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
tm.assert_frame_equal(result, expected)
def test_merge_cross_more_than_one_column():
# GH#5401
left = DataFrame({"A": list("ab"), "B": [2, 1]})
right = DataFrame({"C": range(2), "D": range(4, 6)})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"A": ["a", "a", "b", "b"],
"B": [2, 2, 1, 1],
"C": [0, 1, 0, 1],
"D": [4, 5, 4, 5],
}
)
tm.assert_frame_equal(result, expected)
def test_merge_cross_null_values(nulls_fixture):
# GH#5401
left = DataFrame({"a": [1, nulls_fixture]})
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"a": [1, 1, nulls_fixture, nulls_fixture],
"b": ["a", "b", "a", "b"],
"c": [1.0, 2.0, 1.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)
def test_join_cross_error_reporting():
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"a": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
left.join(right, how="cross", on="a")
def test_merge_cross_series():
# GH#54055
ls = Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")
rs = Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")
res = merge(ls, rs, how="cross")
expected = merge(ls.to_frame(), rs.to_frame(), how="cross")
tm.assert_frame_equal(res, expected)

View File

@ -0,0 +1,186 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
@pytest.fixture
def df1():
return DataFrame(
{
"outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
"inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
"v1": np.linspace(0, 1, 11),
}
)
@pytest.fixture
def df2():
return DataFrame(
{
"outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
"inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
"v2": np.linspace(10, 11, 12),
}
)
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
"""Construct left test DataFrame with specified levels
(any of 'outer', 'inner', and 'v1')
"""
levels = request.param
if levels:
df1 = df1.set_index(levels)
return df1
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
"""Construct right test DataFrame with specified levels
(any of 'outer', 'inner', and 'v2')
"""
levels = request.param
if levels:
df2 = df2.set_index(levels)
return df2
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
"""
Compute the expected merge result for the test case.
This method computes the expected result of merging two DataFrames on
a combination of their columns and index levels. It does so by
explicitly dropping/resetting their named index levels, performing a
merge on their columns, and then finally restoring the appropriate
index in the result.
Parameters
----------
df_left : DataFrame
The left DataFrame (may have zero or more named index levels)
df_right : DataFrame
The right DataFrame (may have zero or more named index levels)
on : list of str
The on parameter to the merge operation
left_on : list of str
The left_on parameter to the merge operation
right_on : list of str
The right_on parameter to the merge operation
how : str
The how parameter to the merge operation
Returns
-------
DataFrame
The expected merge result
"""
# Handle on param if specified
if on is not None:
left_on, right_on = on, on
# Compute input named index levels
left_levels = [n for n in df_left.index.names if n is not None]
right_levels = [n for n in df_right.index.names if n is not None]
# Compute output named index levels
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
# Drop index levels that aren't involved in the merge
drop_left = [n for n in left_levels if n not in left_on]
if drop_left:
df_left = df_left.reset_index(drop_left, drop=True)
drop_right = [n for n in right_levels if n not in right_on]
if drop_right:
df_right = df_right.reset_index(drop_right, drop=True)
# Convert remaining index levels to columns
reset_left = [n for n in left_levels if n in left_on]
if reset_left:
df_left = df_left.reset_index(level=reset_left)
reset_right = [n for n in right_levels if n in right_on]
if reset_right:
df_right = df_right.reset_index(level=reset_right)
# Perform merge
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
# Restore index levels
if output_levels:
expected = expected.set_index(output_levels)
return expected
@pytest.mark.parametrize(
"on,how",
[
(["outer"], "inner"),
(["inner"], "left"),
(["outer", "inner"], "right"),
(["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
# Construct expected result
expected = compute_expected(left_df, right_df, on=on, how=how)
# Perform merge
result = left_df.merge(right_df, on=on, how=how)
tm.assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize(
"left_on,right_on,how",
[
(["outer"], ["outer"], "inner"),
(["inner"], ["inner"], "right"),
(["outer", "inner"], ["outer", "inner"], "left"),
(["inner", "outer"], ["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_lefton_righton(
left_df, right_df, left_on, right_on, how
):
# Construct expected result
expected = compute_expected(
left_df, right_df, left_on=left_on, right_on=right_on, how=how
)
# Perform merge
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
tm.assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
# Construct left_df
left_df = df1.set_index(left_index)
# Construct right_df
right_df = df2.set_index(["outer", "inner"])
# Result
expected = (
left_df.reset_index()
.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
.set_index(left_index)
)
# Perform join
result = left_df.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
tm.assert_frame_equal(result, expected, check_like=True)

View File

@ -0,0 +1,244 @@
import re
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
merge_ordered,
)
import pandas._testing as tm
@pytest.fixture
def left():
return DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
@pytest.fixture
def right():
return DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
class TestMergeOrdered:
def test_basic(self, left, right):
result = merge_ordered(left, right, on="key")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
"rvalue": [np.nan, 1, 2, 3, np.nan, 4],
}
)
tm.assert_frame_equal(result, expected)
def test_ffill(self, left, right):
result = merge_ordered(left, right, on="key", fill_method="ffill")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
"rvalue": [np.nan, 1, 2, 3, 3, 4],
}
)
tm.assert_frame_equal(result, expected)
def test_multigroup(self, left, right):
left = pd.concat([left, left], ignore_index=True)
left["group"] = ["a"] * 3 + ["b"] * 3
result = merge_ordered(
left, right, on="key", left_by="group", fill_method="ffill"
)
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"] * 2,
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
"rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
}
)
expected["group"] = ["a"] * 6 + ["b"] * 6
tm.assert_frame_equal(result, expected.loc[:, result.columns])
result2 = merge_ordered(
right, left, on="key", right_by="group", fill_method="ffill"
)
tm.assert_frame_equal(result, result2.loc[:, result.columns])
result = merge_ordered(left, right, on="key", left_by="group")
assert result["group"].notna().all()
@pytest.mark.filterwarnings(
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
def test_merge_type(self, left, right):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(left)
result = nad.merge(right, on="key")
assert isinstance(result, NotADataFrame)
@pytest.mark.parametrize(
"df_seq, pattern",
[
((), "[Nn]o objects"),
([], "[Nn]o objects"),
({}, "[Nn]o objects"),
([None], "objects.*None"),
([None, None], "objects.*None"),
],
)
def test_empty_sequence_concat(self, df_seq, pattern):
# GH 9157
with pytest.raises(ValueError, match=pattern):
pd.concat(df_seq)
@pytest.mark.parametrize(
"arg", [[DataFrame()], [None, DataFrame()], [DataFrame(), None]]
)
def test_empty_sequence_concat_ok(self, arg):
pd.concat(arg)
def test_doc_example(self):
left = DataFrame(
{
"group": list("aaabbb"),
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3] * 2,
}
)
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
expected = DataFrame(
{
"group": list("aaaaabbbbb"),
"key": ["a", "b", "c", "d", "e"] * 2,
"lvalue": [1, 1, 2, 2, 3] * 2,
"rvalue": [np.nan, 1, 2, 3, 3] * 2,
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"left, right, on, left_by, right_by, expected",
[
(
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
DataFrame({"T": [2], "E": [1]}),
["T"],
["G", "H"],
None,
DataFrame(
{
"G": ["g"] * 3,
"H": ["h"] * 3,
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
}
),
),
(
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
DataFrame({"T": [2], "E": [1]}),
"T",
["G", "H"],
None,
DataFrame(
{
"G": ["g"] * 3,
"H": ["h"] * 3,
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
}
),
),
(
DataFrame({"T": [2], "E": [1]}),
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
["T"],
None,
["G", "H"],
DataFrame(
{
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
"G": ["g"] * 3,
"H": ["h"] * 3,
}
),
),
],
)
def test_list_type_by(self, left, right, on, left_by, right_by, expected):
# GH 35269
result = merge_ordered(
left=left,
right=right,
on=on,
left_by=left_by,
right_by=right_by,
)
tm.assert_frame_equal(result, expected)
def test_left_by_length_equals_to_right_shape0(self):
# GH 38166
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
right = DataFrame([[2, 1]], columns=list("ET"))
result = merge_ordered(left, right, on="E", left_by=["G", "H"])
expected = DataFrame(
{"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
)
tm.assert_frame_equal(result, expected)
def test_elements_not_in_by_but_in_df(self):
# GH 38167
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
right = DataFrame([[2, 1]], columns=list("ET"))
msg = r"\{'h'\} not found in left columns"
with pytest.raises(KeyError, match=msg):
merge_ordered(left, right, on="E", left_by=["G", "h"])
@pytest.mark.parametrize("invalid_method", ["linear", "carrot"])
def test_ffill_validate_fill_method(self, left, right, invalid_method):
# GH 55884
with pytest.raises(
ValueError, match=re.escape("fill_method must be 'ffill' or None")
):
merge_ordered(left, right, on="key", fill_method=invalid_method)
def test_ffill_left_merge(self):
# GH 57010
df1 = DataFrame(
{
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3, 1, 2, 3],
"group": ["a", "a", "a", "b", "b", "b"],
}
)
df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(
df1, df2, fill_method="ffill", left_by="group", how="left"
)
expected = DataFrame(
{
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3, 1, 2, 3],
"group": ["a", "a", "a", "b", "b", "b"],
"rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,934 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
Timestamp,
option_context,
)
import pandas._testing as tm
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
@pytest.fixture
def left():
"""left dataframe (not multi-indexed) for multi-index join tests"""
# a little relevant example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
data = np.random.default_rng(2).standard_normal(len(key1))
return DataFrame({"key1": key1, "key2": key2, "data": data})
@pytest.fixture
def right(multiindex_dataframe_random_data):
"""right dataframe (multi-indexed) for multi-index join tests"""
df = multiindex_dataframe_random_data
df.index.names = ["key1", "key2"]
df.columns = ["j_one", "j_two", "j_three"]
return df
@pytest.fixture
def left_multi():
return DataFrame(
{
"Origin": ["A", "A", "B", "B", "C"],
"Destination": ["A", "B", "A", "C", "A"],
"Period": ["AM", "AM", "IP", "AM", "OP"],
"TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
"Trips": [1987, 3647, 2470, 4296, 4444],
},
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
).set_index(["Origin", "Destination", "Period", "TripPurp"])
@pytest.fixture
def right_multi():
return DataFrame(
{
"Origin": ["A", "A", "B", "B", "C", "C", "E"],
"Destination": ["A", "B", "A", "B", "A", "B", "F"],
"Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
"LinkType": ["a", "b", "c", "b", "a", "b", "a"],
"Distance": [100, 80, 90, 80, 75, 35, 55],
},
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
).set_index(["Origin", "Destination", "Period", "LinkType"])
@pytest.fixture
def on_cols_multi():
return ["Origin", "Destination", "Period"]
class TestMergeMulti:
def test_merge_on_multikey(self, left, right, join_type):
on_cols = ["key1", "key2"]
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
drop=True
)
expected = merge(
left, right.reset_index(), on=on_cols, how=join_type, sort=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("sort", [True, False])
def test_left_join_multi_index(self, sort, infer_string):
with option_context("future.infer_string", infer_string):
icols = ["1st", "2nd", "3rd"]
def bind_cols(df):
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord("a")
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
def run_asserts(left, right, sort):
res = left.join(right, on=icols, how="left", sort=sort)
assert len(left) < len(res) + 1
assert not res["4th"].isna().any()
assert not res["5th"].isna().any()
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res["4th"], result, check_names=False)
assert result.name is None
if sort:
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
res.index = RangeIndex(len(res))
tm.assert_frame_equal(out, res)
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
left = DataFrame(
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
)
# Explicit cast to float to avoid implicit cast when setting nan
left.insert(
1,
"2nd",
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i].copy()
left["4th"] = bind_cols(left)
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
# inject some nulls
left.loc[1::4, "1st"] = np.nan
left.loc[2::5, "2nd"] = np.nan
left.loc[3::6, "3rd"] = np.nan
left["4th"] = bind_cols(left)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i, :-1]
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
@pytest.mark.parametrize("sort", [False, True])
def test_merge_right_vs_left(self, left, right, sort):
# compare left vs right merge with multikey
on_cols = ["key1", "key2"]
merged_left_right = left.merge(
right, left_on=on_cols, right_index=True, how="left", sort=sort
)
merge_right_left = right.merge(
left, right_on=on_cols, left_index=True, how="right", sort=sort
)
# Reorder columns
merge_right_left = merge_right_left[merged_left_right.columns]
tm.assert_frame_equal(merged_left_right, merge_right_left)
def test_merge_multiple_cols_with_mixed_cols_index(self):
# GH29522
s = Series(
range(6),
MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
name="Amount",
)
df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
result = merge(df, s.reset_index(), on=["lev1", "lev2"])
expected = DataFrame(
{
"lev1": list("AAABBB"),
"lev2": [1, 2, 3, 1, 2, 3],
"col": [0] * 6,
"Amount": range(6),
}
)
tm.assert_frame_equal(result, expected)
def test_compress_group_combinations(self):
# ~ 40000000 possible unique groups
key1 = [str(i) for i in range(10000)]
key1 = np.tile(key1, 2)
key2 = key1[::-1]
df = DataFrame(
{
"key1": key1,
"key2": key2,
"value1": np.random.default_rng(2).standard_normal(20000),
}
)
df2 = DataFrame(
{
"key1": key1[::2],
"key2": key2[::2],
"value2": np.random.default_rng(2).standard_normal(10000),
}
)
# just to hit the label compression code path
merge(df, df2, how="outer")
def test_left_join_index_preserve_order(self):
on_cols = ["k1", "k2"]
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"v": np.array(np.arange(24), dtype=np.int64),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result.sort_values(on_cols, kind="mergesort", inplace=True)
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
# test join with multi dtypes blocks
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
"v": np.array(np.arange(24), dtype=np.int32),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result = result.sort_values(on_cols, kind="mergesort")
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match_multiindex(self):
left = DataFrame(
[
["X", "Y", "C", "a"],
["W", "Y", "C", "e"],
["V", "Q", "A", "h"],
["V", "R", "D", "i"],
["X", "Y", "D", "b"],
["X", "Y", "A", "c"],
["W", "Q", "B", "f"],
["W", "R", "C", "g"],
["V", "Y", "C", "j"],
["X", "Y", "B", "d"],
],
columns=["cola", "colb", "colc", "tag"],
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
)
right = DataFrame(
[
["W", "R", "C", 0],
["W", "Q", "B", 3],
["W", "Q", "B", 8],
["X", "Y", "A", 1],
["X", "Y", "A", 4],
["X", "Y", "B", 5],
["X", "Y", "C", 6],
["X", "Y", "C", 9],
["X", "Q", "C", -6],
["X", "R", "C", -9],
["V", "Y", "C", 7],
["V", "R", "D", 2],
["V", "R", "D", -1],
["V", "Q", "A", -3],
],
columns=["col1", "col2", "col3", "val"],
).set_index(["col1", "col2", "col3"])
result = left.join(right, on=["cola", "colb", "colc"], how="left")
expected = DataFrame(
[
["X", "Y", "C", "a", 6],
["X", "Y", "C", "a", 9],
["W", "Y", "C", "e", np.nan],
["V", "Q", "A", "h", -3],
["V", "R", "D", "i", 2],
["V", "R", "D", "i", -1],
["X", "Y", "D", "b", np.nan],
["X", "Y", "A", "c", 1],
["X", "Y", "A", "c", 4],
["W", "Q", "B", "f", 3],
["W", "Q", "B", "f", 8],
["W", "R", "C", "g", 0],
["V", "Y", "C", "j", 7],
["X", "Y", "B", "d", 5],
],
columns=["cola", "colb", "colc", "tag", "val"],
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match(self):
left = DataFrame(
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
columns=["tag", "val"],
index=[2, 0, 1, 3],
)
right = DataFrame(
[
["a", "v"],
["c", "w"],
["c", "x"],
["d", "y"],
["a", "z"],
["c", "r"],
["e", "q"],
["c", "s"],
],
columns=["tag", "char"],
).set_index("tag")
result = left.join(right, on="tag", how="left")
expected = DataFrame(
[
["c", 0, "w"],
["c", 0, "x"],
["c", 0, "r"],
["c", 0, "s"],
["b", 1, np.nan],
["a", 2, "v"],
["a", 2, "z"],
["b", 3, np.nan],
],
columns=["tag", "val", "char"],
index=[2, 2, 2, 2, 0, 1, 1, 3],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on="tag", how="left", sort=True)
expected2 = expected.sort_values("tag", kind="mergesort")
tm.assert_frame_equal(result, expected2)
# GH7331 - maintain left frame order in left merge
result = merge(left, right.reset_index(), how="left", on="tag")
expected.index = RangeIndex(len(expected))
tm.assert_frame_equal(result, expected)
def test_left_merge_na_buglet(self):
left = DataFrame(
{
"id": list("abcde"),
"v1": np.random.default_rng(2).standard_normal(5),
"v2": np.random.default_rng(2).standard_normal(5),
"dummy": list("abcde"),
"v3": np.random.default_rng(2).standard_normal(5),
},
columns=["id", "v1", "v2", "dummy", "v3"],
)
right = DataFrame(
{
"id": ["a", "b", np.nan, np.nan, np.nan],
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
}
)
result = merge(left, right, on="id", how="left")
rdf = right.drop(["id"], axis=1)
expected = left.join(rdf)
tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
data = [
[1950, "A", 1.5],
[1950, "B", 1.5],
[1955, "B", 1.5],
[1960, "B", np.nan],
[1970, "B", 4.0],
[1950, "C", 4.0],
[1960, "C", np.nan],
[1965, "C", 3.0],
[1970, "C", 4.0],
]
frame = DataFrame(data, columns=["year", "panel", "data"])
other_data = [
[1960, "A", np.nan],
[1970, "A", np.nan],
[1955, "A", np.nan],
[1965, "A", np.nan],
[1965, "B", np.nan],
[1955, "C", np.nan],
]
other = DataFrame(other_data, columns=["year", "panel", "data"])
result = frame.merge(other, how="outer")
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
expected = expected.replace(-999, np.nan)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if klass is not None:
on_vector = klass(on_vector)
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("merge_type", ["left", "right"])
def test_merge_datetime_multi_index_empty_df(self, merge_type):
# see gh-36895
left = DataFrame(
data={
"data": [1.5, 1.5],
},
index=MultiIndex.from_tuples(
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
names=["date", "panel"],
),
)
right = DataFrame(
index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
)
expected_index = MultiIndex.from_tuples(
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
names=["date", "panel"],
)
if merge_type == "left":
expected = DataFrame(
data={
"data": [1.5, 1.5],
"state": np.array([np.nan, np.nan], dtype=object),
},
index=expected_index,
)
results_merge = left.merge(right, how="left", on=["date", "panel"])
results_join = left.join(right, how="left")
else:
expected = DataFrame(
data={
"state": np.array([np.nan, np.nan], dtype=object),
"data": [1.5, 1.5],
},
index=expected_index,
)
results_merge = right.merge(left, how="right", on=["date", "panel"])
results_join = right.join(left, how="right")
tm.assert_frame_equal(results_merge, expected)
tm.assert_frame_equal(results_join, expected)
@pytest.fixture
def household(self):
household = DataFrame(
{
"household_id": [1, 2, 3],
"male": [0, 1, 0],
"wealth": [196087.3, 316478.7, 294750],
},
columns=["household_id", "male", "wealth"],
).set_index("household_id")
return household
@pytest.fixture
def portfolio(self):
portfolio = DataFrame(
{
"household_id": [1, 2, 2, 3, 3, 3, 4],
"asset_id": [
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
"name": [
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
np.nan,
],
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
},
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
return portfolio
@pytest.fixture
def expected(self):
expected = (
DataFrame(
{
"male": [0, 1, 1, 0, 0, 0],
"wealth": [
196087.3,
316478.7,
316478.7,
294750.0,
294750.0,
294750.0,
],
"name": [
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
],
"share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
"household_id": [1, 2, 2, 3, 3, 3],
"asset_id": [
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
],
}
)
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
)
return expected
def test_join_multi_levels(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
# GH 3662
# merge multi-levels
result = household.join(portfolio, how="inner")
tm.assert_frame_equal(result, expected)
def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
# equivalency
result = merge(
household.reset_index(),
portfolio.reset_index(),
on=["household_id"],
how="inner",
).set_index(["household_id", "asset_id"])
tm.assert_frame_equal(result, expected)
def test_join_multi_levels_outer(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
result = household.join(portfolio, how="outer")
expected = concat(
[
expected,
(
DataFrame(
{"share": [1.00]},
index=MultiIndex.from_tuples(
[(4, np.nan)], names=["household_id", "asset_id"]
),
)
),
],
axis=0,
sort=True,
).reindex(columns=expected.columns)
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_join_multi_levels_invalid(self, portfolio, household):
portfolio = portfolio.copy()
household = household.copy()
# invalid cases
household.index.name = "foo"
with pytest.raises(
ValueError, match="cannot join with no overlapping index names"
):
household.join(portfolio, how="inner")
portfolio2 = portfolio.copy()
portfolio2.index.set_names(["household_id", "foo"])
with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
portfolio2.join(portfolio, how="inner")
def test_join_multi_levels2(self):
# some more advanced merges
# GH6360
household = DataFrame(
{
"household_id": [1, 2, 2, 3, 3, 3, 4],
"asset_id": [
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
},
columns=["household_id", "asset_id", "share"],
).set_index(["household_id", "asset_id"])
log_return = DataFrame(
{
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
"t": [233, 234, 235, 180, 181],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
}
).set_index(["asset_id", "t"])
expected = (
DataFrame(
{
"household_id": [2, 2, 2, 3, 3, 3, 3, 3],
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
"t": [233, 234, 235, 233, 234, 235, 180, 181],
"share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
}
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
# this is the equivalency
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="inner",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
expected = (
DataFrame(
{
"household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4],
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
"nl0000289965",
"nl0000301109",
"nl0000301109",
None,
],
"t": [
233,
234,
235,
233,
234,
235,
180,
181,
None,
None,
None,
None,
],
"share": [
0.6,
0.6,
0.6,
0.15,
0.15,
0.15,
0.6,
0.6,
0.25,
1.0,
0.4,
1.0,
],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
None,
None,
None,
None,
],
}
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="outer",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
class TestJoinMultiMulti:
def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
left_names = left_multi.index.names
right_names = right_multi.index.names
if join_type == "right":
level_order = right_names + left_names.difference(right_names)
else:
level_order = left_names + right_names.difference(left_names)
# Multi-index join tests
expected = (
merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(level_order)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
def test_join_multi_empty_frames(
self, left_multi, right_multi, join_type, on_cols_multi
):
left_multi = left_multi.drop(columns=left_multi.columns)
right_multi = right_multi.drop(columns=right_multi.columns)
left_names = left_multi.index.names
right_names = right_multi.index.names
if join_type == "right":
level_order = right_names + left_names.difference(right_names)
else:
level_order = left_names + right_names.difference(left_names)
expected = (
merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(level_order)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if box is not None:
on_vector = box(on_vector)
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_single_common_level(self):
index_left = MultiIndex.from_tuples(
[("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
)
left = DataFrame(
{"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
)
index_right = MultiIndex.from_tuples(
[("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
)
right = DataFrame(
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
index=index_right,
)
result = left.join(right)
expected = merge(
left.reset_index(), right.reset_index(), on=["key"], how="inner"
).set_index(["key", "X", "Y"])
tm.assert_frame_equal(result, expected)
def test_join_multi_wrong_order(self):
# GH 25760
# GH 28956
midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
result = left.join(right)
expected = DataFrame(
index=midx1,
data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
)
tm.assert_frame_equal(result, expected)