Updated script that can be controled by Nodejs web app
This commit is contained in:
1101
lib/python3.13/site-packages/pandas/tests/reshape/merge/test_join.py
Normal file
1101
lib/python3.13/site-packages/pandas/tests/reshape/merge/test_join.py
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,111 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.merge import (
|
||||
MergeError,
|
||||
merge,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
|
||||
)
|
||||
def test_merge_cross(input_col, output_cols):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({input_col: [3, 4]})
|
||||
left_copy = left.copy()
|
||||
right_copy = right.copy()
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(left, left_copy)
|
||||
tm.assert_frame_equal(right, right_copy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"left_index": True},
|
||||
{"right_index": True},
|
||||
{"on": "a"},
|
||||
{"left_on": "a"},
|
||||
{"right_on": "b"},
|
||||
],
|
||||
)
|
||||
def test_merge_cross_error_reporting(kwargs):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({"b": [3, 4]})
|
||||
msg = (
|
||||
"Can not pass on, right_on, left_on or set right_index=True or "
|
||||
"left_index=True"
|
||||
)
|
||||
with pytest.raises(MergeError, match=msg):
|
||||
merge(left, right, how="cross", **kwargs)
|
||||
|
||||
|
||||
def test_merge_cross_mixed_dtypes():
|
||||
# GH#5401
|
||||
left = DataFrame(["a", "b", "c"], columns=["A"])
|
||||
right = DataFrame(range(2), columns=["B"])
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_more_than_one_column():
|
||||
# GH#5401
|
||||
left = DataFrame({"A": list("ab"), "B": [2, 1]})
|
||||
right = DataFrame({"C": range(2), "D": range(4, 6)})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": ["a", "a", "b", "b"],
|
||||
"B": [2, 2, 1, 1],
|
||||
"C": [0, 1, 0, 1],
|
||||
"D": [4, 5, 4, 5],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_null_values(nulls_fixture):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, nulls_fixture]})
|
||||
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, nulls_fixture, nulls_fixture],
|
||||
"b": ["a", "b", "a", "b"],
|
||||
"c": [1.0, 2.0, 1.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_cross_error_reporting():
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({"a": [3, 4]})
|
||||
msg = (
|
||||
"Can not pass on, right_on, left_on or set right_index=True or "
|
||||
"left_index=True"
|
||||
)
|
||||
with pytest.raises(MergeError, match=msg):
|
||||
left.join(right, how="cross", on="a")
|
||||
|
||||
|
||||
def test_merge_cross_series():
|
||||
# GH#54055
|
||||
ls = Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")
|
||||
rs = Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")
|
||||
res = merge(ls, rs, how="cross")
|
||||
|
||||
expected = merge(ls.to_frame(), rs.to_frame(), how="cross")
|
||||
tm.assert_frame_equal(res, expected)
|
@ -0,0 +1,186 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df1():
|
||||
return DataFrame(
|
||||
{
|
||||
"outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
|
||||
"inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
|
||||
"v1": np.linspace(0, 1, 11),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df2():
|
||||
return DataFrame(
|
||||
{
|
||||
"outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
|
||||
"inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
|
||||
"v2": np.linspace(10, 11, 12),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
|
||||
def left_df(request, df1):
|
||||
"""Construct left test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v1')
|
||||
"""
|
||||
levels = request.param
|
||||
if levels:
|
||||
df1 = df1.set_index(levels)
|
||||
|
||||
return df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
|
||||
def right_df(request, df2):
|
||||
"""Construct right test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v2')
|
||||
"""
|
||||
levels = request.param
|
||||
|
||||
if levels:
|
||||
df2 = df2.set_index(levels)
|
||||
|
||||
return df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
|
||||
"""
|
||||
Compute the expected merge result for the test case.
|
||||
|
||||
This method computes the expected result of merging two DataFrames on
|
||||
a combination of their columns and index levels. It does so by
|
||||
explicitly dropping/resetting their named index levels, performing a
|
||||
merge on their columns, and then finally restoring the appropriate
|
||||
index in the result.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_left : DataFrame
|
||||
The left DataFrame (may have zero or more named index levels)
|
||||
df_right : DataFrame
|
||||
The right DataFrame (may have zero or more named index levels)
|
||||
on : list of str
|
||||
The on parameter to the merge operation
|
||||
left_on : list of str
|
||||
The left_on parameter to the merge operation
|
||||
right_on : list of str
|
||||
The right_on parameter to the merge operation
|
||||
how : str
|
||||
The how parameter to the merge operation
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The expected merge result
|
||||
"""
|
||||
# Handle on param if specified
|
||||
if on is not None:
|
||||
left_on, right_on = on, on
|
||||
|
||||
# Compute input named index levels
|
||||
left_levels = [n for n in df_left.index.names if n is not None]
|
||||
right_levels = [n for n in df_right.index.names if n is not None]
|
||||
|
||||
# Compute output named index levels
|
||||
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
|
||||
|
||||
# Drop index levels that aren't involved in the merge
|
||||
drop_left = [n for n in left_levels if n not in left_on]
|
||||
if drop_left:
|
||||
df_left = df_left.reset_index(drop_left, drop=True)
|
||||
|
||||
drop_right = [n for n in right_levels if n not in right_on]
|
||||
if drop_right:
|
||||
df_right = df_right.reset_index(drop_right, drop=True)
|
||||
|
||||
# Convert remaining index levels to columns
|
||||
reset_left = [n for n in left_levels if n in left_on]
|
||||
if reset_left:
|
||||
df_left = df_left.reset_index(level=reset_left)
|
||||
|
||||
reset_right = [n for n in right_levels if n in right_on]
|
||||
if reset_right:
|
||||
df_right = df_right.reset_index(level=reset_right)
|
||||
|
||||
# Perform merge
|
||||
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
|
||||
|
||||
# Restore index levels
|
||||
if output_levels:
|
||||
expected = expected.set_index(output_levels)
|
||||
|
||||
return expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"on,how",
|
||||
[
|
||||
(["outer"], "inner"),
|
||||
(["inner"], "left"),
|
||||
(["outer", "inner"], "right"),
|
||||
(["inner", "outer"], "outer"),
|
||||
],
|
||||
)
|
||||
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df, on=on, how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, on=on, how=how)
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left_on,right_on,how",
|
||||
[
|
||||
(["outer"], ["outer"], "inner"),
|
||||
(["inner"], ["inner"], "right"),
|
||||
(["outer", "inner"], ["outer", "inner"], "left"),
|
||||
(["inner", "outer"], ["inner", "outer"], "outer"),
|
||||
],
|
||||
)
|
||||
def test_merge_indexes_and_columns_lefton_righton(
|
||||
left_df, right_df, left_on, right_on, how
|
||||
):
|
||||
# Construct expected result
|
||||
expected = compute_expected(
|
||||
left_df, right_df, left_on=left_on, right_on=right_on, how=how
|
||||
)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
|
||||
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
|
||||
# Construct left_df
|
||||
left_df = df1.set_index(left_index)
|
||||
|
||||
# Construct right_df
|
||||
right_df = df2.set_index(["outer", "inner"])
|
||||
|
||||
# Result
|
||||
expected = (
|
||||
left_df.reset_index()
|
||||
.join(
|
||||
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
|
||||
)
|
||||
.set_index(left_index)
|
||||
)
|
||||
|
||||
# Perform join
|
||||
result = left_df.join(
|
||||
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
@ -0,0 +1,244 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
merge_ordered,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left():
|
||||
return DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right():
|
||||
return DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
|
||||
|
||||
|
||||
class TestMergeOrdered:
|
||||
def test_basic(self, left, right):
|
||||
result = merge_ordered(left, right, on="key")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"],
|
||||
"lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
|
||||
"rvalue": [np.nan, 1, 2, 3, np.nan, 4],
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self, left, right):
|
||||
result = merge_ordered(left, right, on="key", fill_method="ffill")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"],
|
||||
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
|
||||
"rvalue": [np.nan, 1, 2, 3, 3, 4],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_multigroup(self, left, right):
|
||||
left = pd.concat([left, left], ignore_index=True)
|
||||
|
||||
left["group"] = ["a"] * 3 + ["b"] * 3
|
||||
|
||||
result = merge_ordered(
|
||||
left, right, on="key", left_by="group", fill_method="ffill"
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"] * 2,
|
||||
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
|
||||
"rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
|
||||
}
|
||||
)
|
||||
expected["group"] = ["a"] * 6 + ["b"] * 6
|
||||
|
||||
tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
result2 = merge_ordered(
|
||||
right, left, on="key", right_by="group", fill_method="ffill"
|
||||
)
|
||||
tm.assert_frame_equal(result, result2.loc[:, result.columns])
|
||||
|
||||
result = merge_ordered(left, right, on="key", left_by="group")
|
||||
assert result["group"].notna().all()
|
||||
|
||||
@pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
|
||||
)
|
||||
def test_merge_type(self, left, right):
|
||||
class NotADataFrame(DataFrame):
|
||||
@property
|
||||
def _constructor(self):
|
||||
return NotADataFrame
|
||||
|
||||
nad = NotADataFrame(left)
|
||||
result = nad.merge(right, on="key")
|
||||
|
||||
assert isinstance(result, NotADataFrame)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"df_seq, pattern",
|
||||
[
|
||||
((), "[Nn]o objects"),
|
||||
([], "[Nn]o objects"),
|
||||
({}, "[Nn]o objects"),
|
||||
([None], "objects.*None"),
|
||||
([None, None], "objects.*None"),
|
||||
],
|
||||
)
|
||||
def test_empty_sequence_concat(self, df_seq, pattern):
|
||||
# GH 9157
|
||||
with pytest.raises(ValueError, match=pattern):
|
||||
pd.concat(df_seq)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arg", [[DataFrame()], [None, DataFrame()], [DataFrame(), None]]
|
||||
)
|
||||
def test_empty_sequence_concat_ok(self, arg):
|
||||
pd.concat(arg)
|
||||
|
||||
def test_doc_example(self):
|
||||
left = DataFrame(
|
||||
{
|
||||
"group": list("aaabbb"),
|
||||
"key": ["a", "c", "e", "a", "c", "e"],
|
||||
"lvalue": [1, 2, 3] * 2,
|
||||
}
|
||||
)
|
||||
|
||||
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
|
||||
|
||||
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"group": list("aaaaabbbbb"),
|
||||
"key": ["a", "b", "c", "d", "e"] * 2,
|
||||
"lvalue": [1, 1, 2, 2, 3] * 2,
|
||||
"rvalue": [np.nan, 1, 2, 3, 3] * 2,
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left, right, on, left_by, right_by, expected",
|
||||
[
|
||||
(
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
["T"],
|
||||
["G", "H"],
|
||||
None,
|
||||
DataFrame(
|
||||
{
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
"T",
|
||||
["G", "H"],
|
||||
None,
|
||||
DataFrame(
|
||||
{
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
["T"],
|
||||
None,
|
||||
["G", "H"],
|
||||
DataFrame(
|
||||
{
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_list_type_by(self, left, right, on, left_by, right_by, expected):
|
||||
# GH 35269
|
||||
result = merge_ordered(
|
||||
left=left,
|
||||
right=right,
|
||||
on=on,
|
||||
left_by=left_by,
|
||||
right_by=right_by,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_by_length_equals_to_right_shape0(self):
|
||||
# GH 38166
|
||||
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
|
||||
right = DataFrame([[2, 1]], columns=list("ET"))
|
||||
result = merge_ordered(left, right, on="E", left_by=["G", "H"])
|
||||
expected = DataFrame(
|
||||
{"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_elements_not_in_by_but_in_df(self):
|
||||
# GH 38167
|
||||
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
|
||||
right = DataFrame([[2, 1]], columns=list("ET"))
|
||||
msg = r"\{'h'\} not found in left columns"
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
merge_ordered(left, right, on="E", left_by=["G", "h"])
|
||||
|
||||
@pytest.mark.parametrize("invalid_method", ["linear", "carrot"])
|
||||
def test_ffill_validate_fill_method(self, left, right, invalid_method):
|
||||
# GH 55884
|
||||
with pytest.raises(
|
||||
ValueError, match=re.escape("fill_method must be 'ffill' or None")
|
||||
):
|
||||
merge_ordered(left, right, on="key", fill_method=invalid_method)
|
||||
|
||||
def test_ffill_left_merge(self):
|
||||
# GH 57010
|
||||
df1 = DataFrame(
|
||||
{
|
||||
"key": ["a", "c", "e", "a", "c", "e"],
|
||||
"lvalue": [1, 2, 3, 1, 2, 3],
|
||||
"group": ["a", "a", "a", "b", "b", "b"],
|
||||
}
|
||||
)
|
||||
df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
|
||||
result = merge_ordered(
|
||||
df1, df2, fill_method="ffill", left_by="group", how="left"
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "c", "e", "a", "c", "e"],
|
||||
"lvalue": [1, 2, 3, 1, 2, 3],
|
||||
"group": ["a", "a", "a", "b", "b", "b"],
|
||||
"rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,934 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
RangeIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.merge import merge
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left():
|
||||
"""left dataframe (not multi-indexed) for multi-index join tests"""
|
||||
# a little relevant example with NAs
|
||||
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
|
||||
key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
|
||||
|
||||
data = np.random.default_rng(2).standard_normal(len(key1))
|
||||
return DataFrame({"key1": key1, "key2": key2, "data": data})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right(multiindex_dataframe_random_data):
|
||||
"""right dataframe (multi-indexed) for multi-index join tests"""
|
||||
df = multiindex_dataframe_random_data
|
||||
df.index.names = ["key1", "key2"]
|
||||
|
||||
df.columns = ["j_one", "j_two", "j_three"]
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left_multi():
|
||||
return DataFrame(
|
||||
{
|
||||
"Origin": ["A", "A", "B", "B", "C"],
|
||||
"Destination": ["A", "B", "A", "C", "A"],
|
||||
"Period": ["AM", "AM", "IP", "AM", "OP"],
|
||||
"TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
|
||||
"Trips": [1987, 3647, 2470, 4296, 4444],
|
||||
},
|
||||
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
|
||||
).set_index(["Origin", "Destination", "Period", "TripPurp"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right_multi():
|
||||
return DataFrame(
|
||||
{
|
||||
"Origin": ["A", "A", "B", "B", "C", "C", "E"],
|
||||
"Destination": ["A", "B", "A", "B", "A", "B", "F"],
|
||||
"Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
|
||||
"LinkType": ["a", "b", "c", "b", "a", "b", "a"],
|
||||
"Distance": [100, 80, 90, 80, 75, 35, 55],
|
||||
},
|
||||
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
|
||||
).set_index(["Origin", "Destination", "Period", "LinkType"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def on_cols_multi():
|
||||
return ["Origin", "Destination", "Period"]
|
||||
|
||||
|
||||
class TestMergeMulti:
|
||||
def test_merge_on_multikey(self, left, right, join_type):
|
||||
on_cols = ["key1", "key2"]
|
||||
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
|
||||
|
||||
expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
|
||||
drop=True
|
||||
)
|
||||
|
||||
expected = merge(
|
||||
left, right.reset_index(), on=on_cols, how=join_type, sort=True
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
|
||||
)
|
||||
@pytest.mark.parametrize("sort", [True, False])
|
||||
def test_left_join_multi_index(self, sort, infer_string):
|
||||
with option_context("future.infer_string", infer_string):
|
||||
icols = ["1st", "2nd", "3rd"]
|
||||
|
||||
def bind_cols(df):
|
||||
iord = lambda a: 0 if a != a else ord(a)
|
||||
f = lambda ts: ts.map(iord) - ord("a")
|
||||
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
|
||||
|
||||
def run_asserts(left, right, sort):
|
||||
res = left.join(right, on=icols, how="left", sort=sort)
|
||||
|
||||
assert len(left) < len(res) + 1
|
||||
assert not res["4th"].isna().any()
|
||||
assert not res["5th"].isna().any()
|
||||
|
||||
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
|
||||
result = bind_cols(res.iloc[:, :-2])
|
||||
tm.assert_series_equal(res["4th"], result, check_names=False)
|
||||
assert result.name is None
|
||||
|
||||
if sort:
|
||||
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
|
||||
|
||||
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
|
||||
|
||||
res.index = RangeIndex(len(res))
|
||||
tm.assert_frame_equal(out, res)
|
||||
|
||||
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
|
||||
left = DataFrame(
|
||||
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
|
||||
)
|
||||
# Explicit cast to float to avoid implicit cast when setting nan
|
||||
left.insert(
|
||||
1,
|
||||
"2nd",
|
||||
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
|
||||
)
|
||||
|
||||
i = np.random.default_rng(2).permutation(len(left))
|
||||
right = left.iloc[i].copy()
|
||||
|
||||
left["4th"] = bind_cols(left)
|
||||
right["5th"] = -bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
# inject some nulls
|
||||
left.loc[1::4, "1st"] = np.nan
|
||||
left.loc[2::5, "2nd"] = np.nan
|
||||
left.loc[3::6, "3rd"] = np.nan
|
||||
left["4th"] = bind_cols(left)
|
||||
|
||||
i = np.random.default_rng(2).permutation(len(left))
|
||||
right = left.iloc[i, :-1]
|
||||
right["5th"] = -bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
@pytest.mark.parametrize("sort", [False, True])
|
||||
def test_merge_right_vs_left(self, left, right, sort):
|
||||
# compare left vs right merge with multikey
|
||||
on_cols = ["key1", "key2"]
|
||||
merged_left_right = left.merge(
|
||||
right, left_on=on_cols, right_index=True, how="left", sort=sort
|
||||
)
|
||||
|
||||
merge_right_left = right.merge(
|
||||
left, right_on=on_cols, left_index=True, how="right", sort=sort
|
||||
)
|
||||
|
||||
# Reorder columns
|
||||
merge_right_left = merge_right_left[merged_left_right.columns]
|
||||
|
||||
tm.assert_frame_equal(merged_left_right, merge_right_left)
|
||||
|
||||
def test_merge_multiple_cols_with_mixed_cols_index(self):
|
||||
# GH29522
|
||||
s = Series(
|
||||
range(6),
|
||||
MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
|
||||
name="Amount",
|
||||
)
|
||||
df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
|
||||
result = merge(df, s.reset_index(), on=["lev1", "lev2"])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"lev1": list("AAABBB"),
|
||||
"lev2": [1, 2, 3, 1, 2, 3],
|
||||
"col": [0] * 6,
|
||||
"Amount": range(6),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_compress_group_combinations(self):
|
||||
# ~ 40000000 possible unique groups
|
||||
key1 = [str(i) for i in range(10000)]
|
||||
key1 = np.tile(key1, 2)
|
||||
key2 = key1[::-1]
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"key1": key1,
|
||||
"key2": key2,
|
||||
"value1": np.random.default_rng(2).standard_normal(20000),
|
||||
}
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"key1": key1[::2],
|
||||
"key2": key2[::2],
|
||||
"value2": np.random.default_rng(2).standard_normal(10000),
|
||||
}
|
||||
)
|
||||
|
||||
# just to hit the label compression code path
|
||||
merge(df, df2, how="outer")
|
||||
|
||||
def test_left_join_index_preserve_order(self):
|
||||
on_cols = ["k1", "k2"]
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"v": np.array(np.arange(24), dtype=np.int64),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result.sort_values(on_cols, kind="mergesort", inplace=True)
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# test join with multi dtypes blocks
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
|
||||
"v": np.array(np.arange(24), dtype=np.int32),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = result.sort_values(on_cols, kind="mergesort")
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match_multiindex(self):
|
||||
left = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a"],
|
||||
["W", "Y", "C", "e"],
|
||||
["V", "Q", "A", "h"],
|
||||
["V", "R", "D", "i"],
|
||||
["X", "Y", "D", "b"],
|
||||
["X", "Y", "A", "c"],
|
||||
["W", "Q", "B", "f"],
|
||||
["W", "R", "C", "g"],
|
||||
["V", "Y", "C", "j"],
|
||||
["X", "Y", "B", "d"],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag"],
|
||||
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["W", "R", "C", 0],
|
||||
["W", "Q", "B", 3],
|
||||
["W", "Q", "B", 8],
|
||||
["X", "Y", "A", 1],
|
||||
["X", "Y", "A", 4],
|
||||
["X", "Y", "B", 5],
|
||||
["X", "Y", "C", 6],
|
||||
["X", "Y", "C", 9],
|
||||
["X", "Q", "C", -6],
|
||||
["X", "R", "C", -9],
|
||||
["V", "Y", "C", 7],
|
||||
["V", "R", "D", 2],
|
||||
["V", "R", "D", -1],
|
||||
["V", "Q", "A", -3],
|
||||
],
|
||||
columns=["col1", "col2", "col3", "val"],
|
||||
).set_index(["col1", "col2", "col3"])
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a", 6],
|
||||
["X", "Y", "C", "a", 9],
|
||||
["W", "Y", "C", "e", np.nan],
|
||||
["V", "Q", "A", "h", -3],
|
||||
["V", "R", "D", "i", 2],
|
||||
["V", "R", "D", "i", -1],
|
||||
["X", "Y", "D", "b", np.nan],
|
||||
["X", "Y", "A", "c", 1],
|
||||
["X", "Y", "A", "c", 4],
|
||||
["W", "Q", "B", "f", 3],
|
||||
["W", "Q", "B", "f", 8],
|
||||
["W", "R", "C", "g", 0],
|
||||
["V", "Y", "C", "j", 7],
|
||||
["X", "Y", "B", "d", 5],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag", "val"],
|
||||
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
|
||||
|
||||
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match(self):
|
||||
left = DataFrame(
|
||||
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
|
||||
columns=["tag", "val"],
|
||||
index=[2, 0, 1, 3],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["a", "v"],
|
||||
["c", "w"],
|
||||
["c", "x"],
|
||||
["d", "y"],
|
||||
["a", "z"],
|
||||
["c", "r"],
|
||||
["e", "q"],
|
||||
["c", "s"],
|
||||
],
|
||||
columns=["tag", "char"],
|
||||
).set_index("tag")
|
||||
|
||||
result = left.join(right, on="tag", how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["c", 0, "w"],
|
||||
["c", 0, "x"],
|
||||
["c", 0, "r"],
|
||||
["c", 0, "s"],
|
||||
["b", 1, np.nan],
|
||||
["a", 2, "v"],
|
||||
["a", 2, "z"],
|
||||
["b", 3, np.nan],
|
||||
],
|
||||
columns=["tag", "val", "char"],
|
||||
index=[2, 2, 2, 2, 0, 1, 1, 3],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on="tag", how="left", sort=True)
|
||||
expected2 = expected.sort_values("tag", kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
|
||||
# GH7331 - maintain left frame order in left merge
|
||||
result = merge(left, right.reset_index(), how="left", on="tag")
|
||||
expected.index = RangeIndex(len(expected))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_merge_na_buglet(self):
|
||||
left = DataFrame(
|
||||
{
|
||||
"id": list("abcde"),
|
||||
"v1": np.random.default_rng(2).standard_normal(5),
|
||||
"v2": np.random.default_rng(2).standard_normal(5),
|
||||
"dummy": list("abcde"),
|
||||
"v3": np.random.default_rng(2).standard_normal(5),
|
||||
},
|
||||
columns=["id", "v1", "v2", "dummy", "v3"],
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"id": ["a", "b", np.nan, np.nan, np.nan],
|
||||
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
result = merge(left, right, on="id", how="left")
|
||||
|
||||
rdf = right.drop(["id"], axis=1)
|
||||
expected = left.join(rdf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_merge_na_keys(self):
|
||||
data = [
|
||||
[1950, "A", 1.5],
|
||||
[1950, "B", 1.5],
|
||||
[1955, "B", 1.5],
|
||||
[1960, "B", np.nan],
|
||||
[1970, "B", 4.0],
|
||||
[1950, "C", 4.0],
|
||||
[1960, "C", np.nan],
|
||||
[1965, "C", 3.0],
|
||||
[1970, "C", 4.0],
|
||||
]
|
||||
|
||||
frame = DataFrame(data, columns=["year", "panel", "data"])
|
||||
|
||||
other_data = [
|
||||
[1960, "A", np.nan],
|
||||
[1970, "A", np.nan],
|
||||
[1955, "A", np.nan],
|
||||
[1965, "A", np.nan],
|
||||
[1965, "B", np.nan],
|
||||
[1955, "C", np.nan],
|
||||
]
|
||||
other = DataFrame(other_data, columns=["year", "panel", "data"])
|
||||
|
||||
result = frame.merge(other, how="outer")
|
||||
|
||||
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
|
||||
expected = expected.replace(-999, np.nan)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
|
||||
def test_merge_datetime_index(self, klass):
|
||||
# see gh-19038
|
||||
df = DataFrame(
|
||||
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
|
||||
)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
on_vector = df.index.year
|
||||
|
||||
if klass is not None:
|
||||
on_vector = klass(on_vector)
|
||||
|
||||
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
|
||||
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
|
||||
|
||||
result = df.merge(df, on=["a", on_vector], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
|
||||
|
||||
result = df.merge(df, on=[df.index.year], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("merge_type", ["left", "right"])
|
||||
def test_merge_datetime_multi_index_empty_df(self, merge_type):
|
||||
# see gh-36895
|
||||
|
||||
left = DataFrame(
|
||||
data={
|
||||
"data": [1.5, 1.5],
|
||||
},
|
||||
index=MultiIndex.from_tuples(
|
||||
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
|
||||
names=["date", "panel"],
|
||||
),
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
|
||||
)
|
||||
|
||||
expected_index = MultiIndex.from_tuples(
|
||||
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
|
||||
names=["date", "panel"],
|
||||
)
|
||||
|
||||
if merge_type == "left":
|
||||
expected = DataFrame(
|
||||
data={
|
||||
"data": [1.5, 1.5],
|
||||
"state": np.array([np.nan, np.nan], dtype=object),
|
||||
},
|
||||
index=expected_index,
|
||||
)
|
||||
results_merge = left.merge(right, how="left", on=["date", "panel"])
|
||||
results_join = left.join(right, how="left")
|
||||
else:
|
||||
expected = DataFrame(
|
||||
data={
|
||||
"state": np.array([np.nan, np.nan], dtype=object),
|
||||
"data": [1.5, 1.5],
|
||||
},
|
||||
index=expected_index,
|
||||
)
|
||||
results_merge = right.merge(left, how="right", on=["date", "panel"])
|
||||
results_join = right.join(left, how="right")
|
||||
|
||||
tm.assert_frame_equal(results_merge, expected)
|
||||
tm.assert_frame_equal(results_join, expected)
|
||||
|
||||
@pytest.fixture
|
||||
def household(self):
|
||||
household = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 3],
|
||||
"male": [0, 1, 0],
|
||||
"wealth": [196087.3, 316478.7, 294750],
|
||||
},
|
||||
columns=["household_id", "male", "wealth"],
|
||||
).set_index("household_id")
|
||||
return household
|
||||
|
||||
@pytest.fixture
|
||||
def portfolio(self):
|
||||
portfolio = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000289783",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
np.nan,
|
||||
],
|
||||
"name": [
|
||||
"ABN Amro",
|
||||
"Robeco",
|
||||
"Royal Dutch Shell",
|
||||
"Royal Dutch Shell",
|
||||
"AAB Eastern Europe Equity Fund",
|
||||
"Postbank BioTech Fonds",
|
||||
np.nan,
|
||||
],
|
||||
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
|
||||
},
|
||||
columns=["household_id", "asset_id", "name", "share"],
|
||||
).set_index(["household_id", "asset_id"])
|
||||
return portfolio
|
||||
|
||||
@pytest.fixture
|
||||
def expected(self):
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"male": [0, 1, 1, 0, 0, 0],
|
||||
"wealth": [
|
||||
196087.3,
|
||||
316478.7,
|
||||
316478.7,
|
||||
294750.0,
|
||||
294750.0,
|
||||
294750.0,
|
||||
],
|
||||
"name": [
|
||||
"ABN Amro",
|
||||
"Robeco",
|
||||
"Royal Dutch Shell",
|
||||
"Royal Dutch Shell",
|
||||
"AAB Eastern Europe Equity Fund",
|
||||
"Postbank BioTech Fonds",
|
||||
],
|
||||
"share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
|
||||
"household_id": [1, 2, 2, 3, 3, 3],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000289783",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id"])
|
||||
.reindex(columns=["male", "wealth", "name", "share"])
|
||||
)
|
||||
return expected
|
||||
|
||||
def test_join_multi_levels(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# GH 3662
|
||||
# merge multi-levels
|
||||
result = household.join(portfolio, how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
portfolio.reset_index(),
|
||||
on=["household_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_outer(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
result = household.join(portfolio, how="outer")
|
||||
expected = concat(
|
||||
[
|
||||
expected,
|
||||
(
|
||||
DataFrame(
|
||||
{"share": [1.00]},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(4, np.nan)], names=["household_id", "asset_id"]
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
axis=0,
|
||||
sort=True,
|
||||
).reindex(columns=expected.columns)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=False)
|
||||
|
||||
def test_join_multi_levels_invalid(self, portfolio, household):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# invalid cases
|
||||
household.index.name = "foo"
|
||||
|
||||
with pytest.raises(
|
||||
ValueError, match="cannot join with no overlapping index names"
|
||||
):
|
||||
household.join(portfolio, how="inner")
|
||||
|
||||
portfolio2 = portfolio.copy()
|
||||
portfolio2.index.set_names(["household_id", "foo"])
|
||||
|
||||
with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
|
||||
portfolio2.join(portfolio, how="inner")
|
||||
|
||||
def test_join_multi_levels2(self):
|
||||
# some more advanced merges
|
||||
# GH6360
|
||||
household = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
np.nan,
|
||||
],
|
||||
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
|
||||
},
|
||||
columns=["household_id", "asset_id", "share"],
|
||||
).set_index(["household_id", "asset_id"])
|
||||
|
||||
log_return = DataFrame(
|
||||
{
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 180, 181],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
).set_index(["asset_id", "t"])
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [2, 2, 2, 3, 3, 3, 3, 3],
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 233, 234, 235, 180, 181],
|
||||
"share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
# this is the equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4],
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
None,
|
||||
],
|
||||
"t": [
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
180,
|
||||
181,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
"share": [
|
||||
0.6,
|
||||
0.6,
|
||||
0.6,
|
||||
0.15,
|
||||
0.15,
|
||||
0.15,
|
||||
0.6,
|
||||
0.6,
|
||||
0.25,
|
||||
1.0,
|
||||
0.4,
|
||||
1.0,
|
||||
],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="outer",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestJoinMultiMulti:
|
||||
def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
|
||||
left_names = left_multi.index.names
|
||||
right_names = right_multi.index.names
|
||||
if join_type == "right":
|
||||
level_order = right_names + left_names.difference(right_names)
|
||||
else:
|
||||
level_order = left_names + right_names.difference(left_names)
|
||||
# Multi-index join tests
|
||||
expected = (
|
||||
merge(
|
||||
left_multi.reset_index(),
|
||||
right_multi.reset_index(),
|
||||
how=join_type,
|
||||
on=on_cols_multi,
|
||||
)
|
||||
.set_index(level_order)
|
||||
.sort_index()
|
||||
)
|
||||
|
||||
result = left_multi.join(right_multi, how=join_type).sort_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_empty_frames(
|
||||
self, left_multi, right_multi, join_type, on_cols_multi
|
||||
):
|
||||
left_multi = left_multi.drop(columns=left_multi.columns)
|
||||
right_multi = right_multi.drop(columns=right_multi.columns)
|
||||
|
||||
left_names = left_multi.index.names
|
||||
right_names = right_multi.index.names
|
||||
if join_type == "right":
|
||||
level_order = right_names + left_names.difference(right_names)
|
||||
else:
|
||||
level_order = left_names + right_names.difference(left_names)
|
||||
|
||||
expected = (
|
||||
merge(
|
||||
left_multi.reset_index(),
|
||||
right_multi.reset_index(),
|
||||
how=join_type,
|
||||
on=on_cols_multi,
|
||||
)
|
||||
.set_index(level_order)
|
||||
.sort_index()
|
||||
)
|
||||
|
||||
result = left_multi.join(right_multi, how=join_type).sort_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
|
||||
def test_merge_datetime_index(self, box):
|
||||
# see gh-19038
|
||||
df = DataFrame(
|
||||
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
|
||||
)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
on_vector = df.index.year
|
||||
|
||||
if box is not None:
|
||||
on_vector = box(on_vector)
|
||||
|
||||
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
|
||||
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
|
||||
|
||||
result = df.merge(df, on=["a", on_vector], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
|
||||
|
||||
result = df.merge(df, on=[df.index.year], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_single_common_level(self):
|
||||
index_left = MultiIndex.from_tuples(
|
||||
[("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
|
||||
)
|
||||
|
||||
left = DataFrame(
|
||||
{"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
|
||||
)
|
||||
|
||||
index_right = MultiIndex.from_tuples(
|
||||
[("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
|
||||
index=index_right,
|
||||
)
|
||||
|
||||
result = left.join(right)
|
||||
expected = merge(
|
||||
left.reset_index(), right.reset_index(), on=["key"], how="inner"
|
||||
).set_index(["key", "X", "Y"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_wrong_order(self):
|
||||
# GH 25760
|
||||
# GH 28956
|
||||
|
||||
midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
|
||||
midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
|
||||
|
||||
left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
|
||||
right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
|
||||
|
||||
result = left.join(right)
|
||||
|
||||
expected = DataFrame(
|
||||
index=midx1,
|
||||
data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
Reference in New Issue
Block a user