diff --git a/tests/frame/__init__.py b/tests/frame/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_frame.py b/tests/frame/test_frame.py similarity index 76% rename from tests/test_frame.py rename to tests/frame/test_frame.py index a316684c0..da83cb605 100644 --- a/tests/test_frame.py +++ b/tests/frame/test_frame.py @@ -2,10 +2,7 @@ from collections import ( OrderedDict, - UserDict, - UserList, defaultdict, - deque, ) from collections.abc import ( Callable, @@ -17,7 +14,6 @@ ) import csv import datetime -from enum import Enum import io import itertools from pathlib import Path @@ -31,11 +27,9 @@ TypeAlias, TypedDict, TypeVar, - cast, ) import numpy as np -import numpy.typing as npt import pandas as pd from pandas.api.typing import NAType from pandas.core.resample import ( @@ -79,7 +73,6 @@ if TYPE_CHECKING: from pandas.core.frame import _PandasNamedTuple - from pandas._typing import S1 else: _PandasNamedTuple: TypeAlias = tuple @@ -88,8 +81,6 @@ else: Pandas4Warning: TypeAlias = FutureWarning # type: ignore[no-redef] -DF = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) - def getCols(k: int) -> str: return string.ascii_uppercase[:k] @@ -99,7 +90,7 @@ def makeStringIndex(k: int = 10) -> pd.Index: return pd.Index(rands_array(nchars=10, size=k), name=None) -def rands_array(nchars: int, size: int) -> npt.NDArray[Any]: +def rands_array(nchars: int, size: int) -> np_ndarray: chars = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) retval = ( np.random.default_rng(2) @@ -247,109 +238,6 @@ def test_types_copy() -> None: check(assert_type(df.copy(), pd.DataFrame), pd.DataFrame) -def test_types_getitem() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) - i = pd.Index(["col1", "col2"]) - s = pd.Series(["col1", "col2"]) - select_df = pd.DataFrame({"col1": [True, True], "col2": [False, True]}) - a = np.array(["col1", "col2"]) - check(assert_type(df["col1"], pd.Series), pd.Series) - 
check(assert_type(df[5], pd.Series), pd.Series) - check(assert_type(df[["col1", "col2"]], pd.DataFrame), pd.DataFrame) - check(assert_type(df[1:], pd.DataFrame), pd.DataFrame) - check(assert_type(df[s], pd.DataFrame), pd.DataFrame) - check(assert_type(df[a], pd.DataFrame), pd.DataFrame) - check(assert_type(df[select_df], pd.DataFrame), pd.DataFrame) - check(assert_type(df[i], pd.DataFrame), pd.DataFrame) - - -def test_types_getitem_with_hashable() -> None: - # Testing getitem support for hashable types that are not scalar - # Due to the bug in https://github.com/pandas-dev/pandas-stubs/issues/592 - class MyEnum(Enum): - FIRST = "tayyar" - SECOND = "haydar" - - df = pd.DataFrame( - data=[[12.2, 10], [8.8, 15]], columns=[MyEnum.FIRST, MyEnum.SECOND] - ) - check(assert_type(df[MyEnum.FIRST], pd.Series), pd.Series) - check(assert_type(df[1:], pd.DataFrame), pd.DataFrame) - check(assert_type(df[:2], pd.DataFrame), pd.DataFrame) - - df2 = pd.DataFrame(data=[[12.2, 10], [8.8, 15]], columns=[3, 4]) - check(assert_type(df2[3], pd.Series), pd.Series) - check(assert_type(df2[[3]], pd.DataFrame), pd.DataFrame) - check(assert_type(df2[[3, 4]], pd.DataFrame), pd.DataFrame) - - -def test_slice_setitem() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) - df[1:] = [10, 11, 12] - - -def test_types_setitem() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) - h = cast(Hashable, "col1") - i = pd.Index(["col1", "col2"]) - s = pd.Series(["col1", "col2"]) - a = np.array(["col1", "col2"]) - df["col1"] = [1, 2] - df[5] = [5, 6] - df[h] = [5, 6] - df.loc[:, h] = [5, 6] - df.loc[:, UserList([h])] = [[5], [6]] - df.loc[:, iter([h])] = [[5], [6]] - df[["col1", "col2"]] = [[1, 2], [3, 4]] - df[s] = [5, 6] - df.loc[:, s] = [5, 6] - df["col1"] = [5, 6] - df[df["col1"] > 1] = [5, 6, 7] - df[a] = [[1, 2], [3, 4]] - df[i] = [8, 9] - - -def test_types_setitem_mask() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 
7]}) - select_df = pd.DataFrame({"col1": [True, True], "col2": [False, True]}) - df[select_df] = [1, 2, 3] - - -def test_types_iloc_iat() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) - check(assert_type(df.iloc[1, 1], Scalar), np.integer) - check(assert_type(df.iloc[[1], [1]], pd.DataFrame), pd.DataFrame) - - check(assert_type(df.iat[0, 0], Scalar), np.integer) - - # https://github.com/microsoft/python-type-stubs/issues/31 - check(assert_type(df.iloc[:, [0]], pd.DataFrame), pd.DataFrame) - check(assert_type(df.iloc[:, 0], pd.Series), pd.Series) - - -def test_types_loc_at() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) - check(assert_type(df.loc[[0], "col1"], pd.Series), pd.Series) - check(assert_type(df.loc[0, "col1"], Scalar), np.integer) - - check(assert_type(df.at[0, "col1"], Scalar), np.integer) - - -def test_types_boolean_indexing() -> None: - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) - check(assert_type(df[df > 1], pd.DataFrame), pd.DataFrame) - check(assert_type(df[~(df > 1.0)], pd.DataFrame), pd.DataFrame) - - row_mask = df["col1"] >= 2 - col_mask = df.columns.isin(["col2"]) - check(assert_type(df.loc[row_mask], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[~row_mask], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[row_mask, :], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[:, col_mask], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[row_mask, col_mask], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[~row_mask, ~col_mask], pd.DataFrame), pd.DataFrame) - - def test_types_df_to_df_comparison() -> None: df = pd.DataFrame(data={"col1": [1, 2]}) df2 = pd.DataFrame(data={"col1": [3, 2]}) @@ -1673,318 +1561,6 @@ def test_pivot_table_sort() -> None: ) -def test_types_groupby_as_index() -> None: - """Test type of groupby.size method depending on `as_index`.""" - df = pd.DataFrame({"a": [1, 2, 3]}) - check( - assert_type( - df.groupby("a", 
as_index=False).size(), - pd.DataFrame, - ), - pd.DataFrame, - ) - check( - assert_type( - df.groupby("a", as_index=True).size(), - "pd.Series[int]", - ), - pd.Series, - ) - check( - assert_type( - df.groupby("a").size(), - "pd.Series[int]", - ), - pd.Series, - ) - - -def test_types_groupby_as_index_list() -> None: - """Test type of groupby.size method depending on list of grouper GH1045.""" - df = pd.DataFrame({"a": [1, 1, 2], "b": [2, 3, 2]}) - check( - assert_type( - df.groupby(["a", "b"], as_index=False).size(), - pd.DataFrame, - ), - pd.DataFrame, - ) - check( - assert_type( - df.groupby(["a", "b"], as_index=True).size(), - "pd.Series[int]", - ), - pd.Series, - ) - check( - assert_type( - df.groupby(["a", "b"]).size(), - "pd.Series[int]", - ), - pd.Series, - ) - - -def test_types_groupby_as_index_value_counts() -> None: - """Test type of groupby.value_counts method depending on `as_index`.""" - df = pd.DataFrame({"a": [1, 2, 3]}) - check( - assert_type( - df.groupby("a", as_index=False).value_counts(), - pd.DataFrame, - ), - pd.DataFrame, - ) - check( - assert_type( - df.groupby("a", as_index=True).value_counts(), - "pd.Series[int]", - ), - pd.Series, - ) - - -def test_types_groupby_size() -> None: - """Test for GH886.""" - data = [ - {"date": "2023-12-01", "val": 12}, - {"date": "2023-12-02", "val": 2}, - {"date": "2023-12-03", "val": 1}, - {"date": "2023-12-03", "val": 10}, - ] - - df = pd.DataFrame(data) - groupby = df.groupby("date") - size = groupby.size() - frame = size.to_frame() - check(assert_type(frame.reset_index(), pd.DataFrame), pd.DataFrame) - - -def test_types_groupby() -> None: - df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]}) - df.index.name = "ind" - df.groupby(by="col1") - df.groupby(level="ind") - df.groupby(by="col1", sort=False, as_index=True) - df.groupby(by=["col1", "col2"]) - # GH 284 - df.groupby(df["col1"] > 2) - df.groupby([df["col1"] > 2, df["col2"] % 2 == 1]) - df.groupby(lambda x: x) - 
df.groupby([lambda x: x % 2, lambda x: x % 3]) - df.groupby(np.array([1, 0, 1])) - df.groupby([np.array([1, 0, 0]), np.array([0, 0, 1])]) - df.groupby({1: 1, 2: 2, 3: 3}) - df.groupby([{1: 1, 2: 1, 3: 2}, {1: 1, 2: 2, 3: 2}]) - df.groupby(df.index) - df.groupby([pd.Index([1, 0, 0]), pd.Index([0, 0, 1])]) - df.groupby(pd.Grouper(level=0)) - df.groupby([pd.Grouper(level=0), pd.Grouper(key="col1")]) - - check(assert_type(df.groupby(by="col1").agg("sum"), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.groupby(level="ind").aggregate("sum"), pd.DataFrame), - pd.DataFrame, - ) - check( - assert_type( - df.groupby(by="col1", sort=False, as_index=True).transform( - lambda x: x.max() - ), - pd.DataFrame, - ), - pd.DataFrame, - ) - check( - assert_type(df.groupby(by=["col1", "col2"]).count(), pd.DataFrame), pd.DataFrame - ) - check( - assert_type( - df.groupby(by=["col1", "col2"]).filter(lambda x: x["col1"] > 0), - pd.DataFrame, - ), - pd.DataFrame, - ) - check( - assert_type(df.groupby(by=["col1", "col2"]).nunique(), pd.DataFrame), - pd.DataFrame, - ) - with pytest_warns_bounded( - FutureWarning, - "(The provided callable is currently using|The behavior of DataFrame.sum with)", - upper="2.3.99", - ): - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply operated on the grouping columns", - upper="2.3.99", - ): - if PD_LTE_23: - check( - assert_type(df.groupby(by="col1").apply(sum), pd.DataFrame), - pd.DataFrame, - ) - check(assert_type(df.groupby("col1").transform("sum"), pd.DataFrame), pd.DataFrame) - s1 = df.set_index("col1")["col2"] - check(assert_type(s1, pd.Series), pd.Series) - check(assert_type(s1.groupby("col1").transform("sum"), pd.Series), pd.Series) - - -def test_types_groupby_methods() -> None: - df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]}) - check(assert_type(df.groupby("col1").sum(), pd.DataFrame), pd.DataFrame) - check(assert_type(df.groupby("col1").prod(), pd.DataFrame), pd.DataFrame) - 
check(assert_type(df.groupby("col1").sample(), pd.DataFrame), pd.DataFrame) - check(assert_type(df.groupby("col1").count(), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.groupby("col1").value_counts(normalize=False), "pd.Series[int]"), - pd.Series, - np.integer, - ) - check( - assert_type( - df.groupby("col1").value_counts(subset=None, normalize=True), - "pd.Series[float]", - ), - pd.Series, - float, - ) - check(assert_type(df.groupby("col1").idxmax(), pd.DataFrame), pd.DataFrame) - check(assert_type(df.groupby("col1").idxmin(), pd.DataFrame), pd.DataFrame) - - -def test_types_groupby_agg() -> None: - df = pd.DataFrame( - data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0], 0: [-1, -1, -1]} - ) - check(assert_type(df.groupby("col1").agg("min"), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.groupby("col1").agg(["min", "max"]), pd.DataFrame), pd.DataFrame - ) - agg_dict1 = {"col2": "min", "col3": "max", 0: "sum"} - check(assert_type(df.groupby("col1").agg(agg_dict1), pd.DataFrame), pd.DataFrame) - - def wrapped_min(x: pd.Series[S1]) -> S1: - return x.min() - - with pytest_warns_bounded( - FutureWarning, - r"The provided callable is currently using", - upper="2.3.99", - ): - check(assert_type(df.groupby("col1")["col3"].agg(min), pd.Series), pd.Series) - check( - assert_type(df.groupby("col1")["col3"].agg([min, max]), pd.DataFrame), - pd.DataFrame, - ) - check(assert_type(df.groupby("col1").agg(min), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.groupby("col1").agg([min, max]), pd.DataFrame), pd.DataFrame - ) - agg_dict2 = {"col2": min, "col3": max, 0: min} - check( - assert_type(df.groupby("col1").agg(agg_dict2), pd.DataFrame), pd.DataFrame - ) - - # Here, MyPy infers dict[object, object], so it must be explicitly annotated - agg_dict3: dict[str | int, str | Callable[..., Any]] = { - "col2": min, - "col3": "max", - 0: wrapped_min, - } - check( - assert_type(df.groupby("col1").agg(agg_dict3), pd.DataFrame), pd.DataFrame - ) - 
agg_dict4 = {"col2": "sum"} - check(assert_type(df.groupby("col1").agg(agg_dict4), pd.DataFrame), pd.DataFrame) - agg_dict5 = {0: "sum"} - check(assert_type(df.groupby("col1").agg(agg_dict5), pd.DataFrame), pd.DataFrame) - named_agg = pd.NamedAgg(column="col2", aggfunc="max") - check( - assert_type(df.groupby("col1").agg(new_col=named_agg), pd.DataFrame), - pd.DataFrame, - ) - # GH#187 - cols: list[str] = ["col1", "col2"] - check(assert_type(df.groupby(by=cols).sum(), pd.DataFrame), pd.DataFrame) - - cols_opt: list[str | None] = ["col1", "col2"] - check(assert_type(df.groupby(by=cols_opt).sum(), pd.DataFrame), pd.DataFrame) - - cols_mixed: list[str | int] = ["col1", 0] - check(assert_type(df.groupby(by=cols_mixed).sum(), pd.DataFrame), pd.DataFrame) - # GH 736 - check(assert_type(df.groupby(by="col1").aggregate("size"), pd.Series), pd.Series) - check(assert_type(df.groupby(by="col1").agg("size"), pd.Series), pd.Series) - - -# This was added in 1.1.0 https://pandas.pydata.org/docs/whatsnew/v1.1.0.html -def test_types_group_by_with_dropna_keyword() -> None: - df = pd.DataFrame( - data={"col1": [1, 1, 2, 1], "col2": [2, None, 1, 2], "col3": [3, 4, 3, 2]} - ) - check( - assert_type(df.groupby(by="col2", dropna=True).sum(), pd.DataFrame), - pd.DataFrame, - ) - check( - assert_type(df.groupby(by="col2", dropna=False).sum(), pd.DataFrame), - pd.DataFrame, - ) - check(assert_type(df.groupby(by="col2").sum(), pd.DataFrame), pd.DataFrame) - - -def test_types_groupby_any() -> None: - df = pd.DataFrame( - data={ - "col1": [1, 1, 2], - "col2": [True, False, False], - "col3": [False, False, False], - } - ) - check(assert_type(df.groupby("col1").any(), pd.DataFrame), pd.DataFrame) - check(assert_type(df.groupby("col1").all(), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.groupby("col1")["col2"].any(), "pd.Series[bool]"), - pd.Series, - np.bool_, - ) - check( - assert_type(df.groupby("col1")["col2"].any(), "pd.Series[bool]"), - pd.Series, - np.bool_, - ) - - -def 
test_types_groupby_iter() -> None: - df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5]}) - series_groupby = pd.Series([True, True, False], dtype=bool) - first_group = next(iter(df.groupby(series_groupby))) - check( - assert_type(first_group[0], bool), - bool, - ) - check( - assert_type(first_group[1], pd.DataFrame), - pd.DataFrame, - ) - - -def test_types_groupby_level() -> None: - # GH 836 - data = { - "col1": [0, 0, 0], - "col2": [0, 1, 0], - "col3": [1, 2, 3], - "col4": [1, 2, 3], - } - df = pd.DataFrame(data=data).set_index(["col1", "col2", "col3"]) - check( - assert_type(df.groupby(level=["col1", "col2"]).sum(), pd.DataFrame), - pd.DataFrame, - ) - - def test_types_merge() -> None: df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5]}) df2 = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [0, 1, 0]}) @@ -2942,32 +2518,6 @@ class ReadCsvKwargs(TypedDict): ) -def test_groupby_series_methods() -> None: - df = pd.DataFrame({"x": [1, 2, 2, 3, 3], "y": [10, 20, 30, 40, 50]}) - gb = df.groupby("x")["y"] - check(assert_type(gb.describe(), pd.DataFrame), pd.DataFrame) - check(assert_type(gb.count().loc[2], int), np.integer) - check(assert_type(gb.pct_change(), pd.Series), pd.Series) - check(assert_type(gb.bfill(), pd.Series), pd.Series) - check(assert_type(gb.cummax(), pd.Series), pd.Series) - check(assert_type(gb.cummin(), pd.Series), pd.Series) - check(assert_type(gb.cumprod(), pd.Series), pd.Series) - check(assert_type(gb.cumsum(), pd.Series), pd.Series) - check(assert_type(gb.ffill(), pd.Series), pd.Series) - check(assert_type(gb.first(), pd.Series), pd.Series) - check(assert_type(gb.head(), pd.Series), pd.Series) - check(assert_type(gb.last(), pd.Series), pd.Series) - check(assert_type(gb.max(), pd.Series), pd.Series) - check(assert_type(gb.mean(), pd.Series), pd.Series) - check(assert_type(gb.median(), pd.Series), pd.Series) - check(assert_type(gb.min(), pd.Series), pd.Series) - check(assert_type(gb.nlargest(), pd.Series), pd.Series) - 
check(assert_type(gb.nsmallest(), pd.Series), pd.Series) - check(assert_type(gb.nth(0), pd.DataFrame | pd.Series), pd.Series) - check(assert_type(gb.nth[0, 1, 2], pd.DataFrame | pd.Series), pd.Series) - check(assert_type(gb.nth((0, 1, 2)), pd.DataFrame | pd.Series), pd.Series) - - def test_dataframe_pct_change() -> None: df = pd.DataFrame({"x": [1, 2, 2, 3, 3], "y": [10, 20, 30, 40, 50]}) check(assert_type(df.pct_change(), pd.DataFrame), pd.DataFrame) @@ -2983,49 +2533,6 @@ def test_dataframe_pct_change() -> None: check(assert_type(df.pct_change(axis="index"), pd.DataFrame), pd.DataFrame) -def test_indexslice_setitem() -> None: - df = pd.DataFrame( - {"x": [1, 2, 2, 3], "y": [1, 2, 3, 4], "z": [10, 20, 30, 40]} - ).set_index(["x", "y"]) - s = pd.Series([-1, -2]) - df.loc[pd.IndexSlice[2, :]] = s.values - df.loc[pd.IndexSlice[2, :], "z"] = [200, 300] - # GH 314 - df.loc[pd.IndexSlice[pd.Index([2, 3]), :], "z"] = 99 - - -def test_indexslice_getitem() -> None: - # GH 300 - df = ( - pd.DataFrame({"x": [1, 2, 2, 3, 4], "y": [10, 20, 30, 40, 10]}) - .assign(z=lambda df: df.x * df.y) - .set_index(["x", "y"]) - ) - ind = pd.Index([2, 3]) - check( - assert_type( - pd.IndexSlice[ind, :], tuple["pd.Index[int]", "slice[None, None, None]"] - ), - tuple, - ) - check(assert_type(df.loc[pd.IndexSlice[ind, :]], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[pd.IndexSlice[1:2]], pd.DataFrame), pd.DataFrame) - check( - assert_type(df.loc[pd.IndexSlice[:, df["z"] > 40], :], pd.DataFrame), - pd.DataFrame, - ) - check(assert_type(df.loc[pd.IndexSlice[2, 30], "z"], Scalar), np.integer) - check( - assert_type(df.loc[pd.IndexSlice[[2, 4], [20, 40]], :], pd.DataFrame), - pd.DataFrame, - ) - # GH 314 - check( - assert_type(df.loc[pd.IndexSlice[pd.Index([2, 4]), :], "z"], pd.Series), - pd.Series, - ) - - def test_compute_values() -> None: df = pd.DataFrame({"x": [1, 2, 3, 4]}) s: pd.Series = pd.Series([10, 20, 30, 40]) @@ -3045,49 +2552,6 @@ def test_sum_get_add() -> None: 
check(assert_type(summer + summer, pd.Series), pd.Series) -def test_getset_untyped() -> None: - """Test that Dataframe.__getitem__ needs to return untyped series.""" - df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [10, 20, 30, 40, 50]}) - check(assert_type(df["x"].max(), Any), np.integer) - - -def test_getmultiindex_columns() -> None: - mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]]) - df = pd.DataFrame([[1, 2, 3, 4], [10, 20, 30, 40]], columns=mi) - li: list[tuple[int, str]] = [(1, "a"), (2, "b")] - check(assert_type(df[[(1, "a"), (2, "b")]], pd.DataFrame), pd.DataFrame) - check(assert_type(df[li], pd.DataFrame), pd.DataFrame) - check( - assert_type( - df[[(i, s) for i in [1] for s in df.columns.get_level_values(1)]], - pd.DataFrame, - ), - pd.DataFrame, - ) - check(assert_type(df[[df.columns[0]]], pd.DataFrame), pd.DataFrame) - check(assert_type(df[df.columns[0]], pd.Series), pd.Series) - check(assert_type(df[li[0]], pd.Series), pd.Series) - - -def test_frame_isin() -> None: - df = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5]) - check(assert_type(df.isin([1, 3, 5]), pd.DataFrame), pd.DataFrame) - check(assert_type(df.isin({1, 3, 5}), pd.DataFrame), pd.DataFrame) - check(assert_type(df.isin(pd.Series([1, 3, 5])), pd.DataFrame), pd.DataFrame) - check(assert_type(df.isin(pd.Index([1, 3, 5])), pd.DataFrame), pd.DataFrame) - check(assert_type(df.isin(df), pd.DataFrame), pd.DataFrame) - check(assert_type(df.isin({"x": [1, 2]}), pd.DataFrame), pd.DataFrame) - check( - assert_type(df.isin(UserDict({"x": iter([1, "2"])})), pd.DataFrame), - pd.DataFrame, - ) - - -def test_frame_getitem_isin() -> None: - df = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5]) - check(assert_type(df[df.index.isin([1, 3, 5])], pd.DataFrame), pd.DataFrame) - - def test_to_excel() -> None: df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) @@ -3330,27 +2794,6 @@ def test_loop_dataframe() -> None: check(assert_type(df[c], pd.Series), pd.Series) -def 
test_groupby_index() -> None: - # GH 42 - df = pd.DataFrame( - data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]} - ).set_index("col1") - check(assert_type(df.groupby(df.index).min(), pd.DataFrame), pd.DataFrame) - - -def test_iloc_npint() -> None: - # GH 69 - df = pd.DataFrame({"a": [10, 20, 30], "b": [20, 40, 60], "c": [30, 60, 90]}) - iloc = np.argmin(np.random.standard_normal(3)) - df.iloc[iloc] - - -# https://github.com/pandas-dev/pandas-stubs/issues/143 -def test_iloc_tuple() -> None: - df = pd.DataFrame({"Char": ["A", "B", "C"], "Number": [1, 2, 3]}) - df = df.iloc[0:2,] - - def test_take() -> None: df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) check(assert_type(df.take([0, 1]), pd.DataFrame), pd.DataFrame) @@ -3476,15 +2919,6 @@ def test_frame_reindex_like() -> None: ) -def test_frame_ndarray_assignmment() -> None: - # GH 100 - df_a = pd.DataFrame({"a": [0.0] * 10}) - df_a.iloc[:, :] = np.array([[-1.0]] * 10) - - df_b = pd.DataFrame({"a": [0.0] * 10, "b": [1.0] * 10}) - df_b.iloc[:, :] = np.array([[-1.0, np.inf]] * 10) - - def test_not_hashable() -> None: # GH 113 check(assert_type(pd.DataFrame.__hash__, None), type(None)) @@ -3503,338 +2937,6 @@ def test_func(_: Hashable) -> None: test_func(pd.Index([])) # type: ignore[arg-type] # pyright: ignore[reportArgumentType] -def test_columns_mixlist() -> None: - # GH 97 - df = pd.DataFrame({"a": [1, 2, 3], 1: [3, 4, 5]}) - key: list[int | str] - key = [1] - check(assert_type(df[key], pd.DataFrame), pd.DataFrame) - - -def test_frame_scalars_slice() -> None: - # GH 133 - # scalars: - # str, bytes, datetime.date, datetime.datetime, datetime.timedelta, bool, int, - # float, complex, Timestamp, Timedelta - - str_ = "a" - bytes_ = b"7" - date = datetime.date(1999, 12, 31) - datetime_ = datetime.datetime(1999, 12, 31) - timedelta = datetime.datetime(2000, 1, 1) - datetime.datetime(1999, 12, 31) - bool_ = True - int_ = 2 - float_ = 3.14 - complex_ = 1.0 + 3.0j - timestamp = pd.Timestamp(0) - pd_timedelta 
= pd.Timedelta(0, unit="D") - none = None - idx = [ - str_, - bytes_, - date, - datetime_, - timedelta, - bool_, - int_, - float_, - complex_, - timestamp, - pd_timedelta, - none, - ] - values = np.arange(len(idx))[:, None] + np.arange(len(idx)) - df = pd.DataFrame(values, columns=idx, index=idx) - - # Note: bool_ cannot be tested since the index is object and pandas does not - # support boolean access using loc except when the index is boolean - check(assert_type(df.loc[str_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[bytes_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[date], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[datetime_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[timedelta], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[int_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[float_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[complex_], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[timestamp], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[pd_timedelta], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[none], pd.Series), pd.Series) - - check(assert_type(df.loc[:, str_], pd.Series), pd.Series) - check(assert_type(df.loc[:, bytes_], pd.Series), pd.Series) - check(assert_type(df.loc[:, date], pd.Series), pd.Series) - check(assert_type(df.loc[:, datetime_], pd.Series), pd.Series) - check(assert_type(df.loc[:, timedelta], pd.Series), pd.Series) - check(assert_type(df.loc[:, int_], pd.Series), pd.Series) - check(assert_type(df.loc[:, float_], pd.Series), pd.Series) - check(assert_type(df.loc[:, complex_], pd.Series), pd.Series) - check(assert_type(df.loc[:, timestamp], pd.Series), pd.Series) - check(assert_type(df.loc[:, pd_timedelta], pd.Series), pd.Series) - check(assert_type(df.loc[:, none], pd.Series), pd.Series) - - # GH749 - - multi_idx = 
pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["alpha", "num"]) - df2 = pd.DataFrame({"col1": range(4)}, index=multi_idx) - check(assert_type(df2.loc[str_], pd.Series | pd.DataFrame), pd.DataFrame) - - df3 = pd.DataFrame({"x": range(2)}, index=pd.Index(["a", "b"])) - check(assert_type(df3.loc[str_], pd.Series | pd.DataFrame), pd.Series) - - # https://github.com/microsoft/python-type-stubs/issues/62 - df7 = pd.DataFrame({"x": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) - index = pd.Index(["b"]) - check(assert_type(df7.loc[index], pd.DataFrame), pd.DataFrame) - - -def test_boolean_loc() -> None: - # Booleans can only be used in loc when the index is boolean - df = pd.DataFrame([[0, 1], [1, 0]], columns=[True, False], index=[True, False]) - check(assert_type(df.loc[True], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[:, False], pd.Series), pd.Series) - - -def test_groupby_result() -> None: - # GH 142 - df = pd.DataFrame({"a": [0, 1, 2], "b": [4, 5, 6], "c": [7, 8, 9]}) - iterator = df.groupby(["a", "b"]).__iter__() - assert_type(iterator, Iterator[tuple[tuple[Hashable, ...], pd.DataFrame]]) - index, value = next(iterator) - assert_type((index, value), tuple[tuple[Hashable, ...], pd.DataFrame]) - - if PD_LTE_23: - check(assert_type(index, tuple[Hashable, ...]), tuple, np.integer) - else: - check(assert_type(index, tuple[Hashable, ...]), tuple, int) - - check(assert_type(value, pd.DataFrame), pd.DataFrame) - - iterator2 = df.groupby("a").__iter__() - assert_type(iterator2, Iterator[tuple[Scalar, pd.DataFrame]]) - index2, value2 = next(iterator2) - assert_type((index2, value2), tuple[Scalar, pd.DataFrame]) - - check(assert_type(index2, Scalar), int) - check(assert_type(value2, pd.DataFrame), pd.DataFrame) - - # GH 674 - # grouping by pd.MultiIndex should always resolve to a tuple as well - multi_index = pd.MultiIndex.from_frame(df[["a", "b"]]) - iterator3 = df.groupby(multi_index).__iter__() - assert_type(iterator3, 
Iterator[tuple[tuple[Hashable, ...], pd.DataFrame]]) - index3, value3 = next(iterator3) - assert_type((index3, value3), tuple[tuple[Hashable, ...], pd.DataFrame]) - - check(assert_type(index3, tuple[Hashable, ...]), tuple, int) - check(assert_type(value3, pd.DataFrame), pd.DataFrame) - - # Want to make sure these cases are differentiated - for (_k1, _k2), _g in df.groupby(["a", "b"]): - pass - - for _kk, _g in df.groupby("a"): - pass - - for (_k1, _k2), _g in df.groupby(multi_index): - pass - - -def test_groupby_result_for_scalar_indexes() -> None: - # GH 674 - dates = pd.date_range("2020-01-01", "2020-12-31") - df = pd.DataFrame({"date": dates, "days": 1}) - period_index = pd.PeriodIndex(df.date, freq="M") - iterator = df.groupby(period_index).__iter__() - assert_type(iterator, Iterator[tuple[pd.Period, pd.DataFrame]]) - index, value = next(iterator) - assert_type((index, value), tuple[pd.Period, pd.DataFrame]) - - check(assert_type(index, pd.Period), pd.Period) - check(assert_type(value, pd.DataFrame), pd.DataFrame) - - dt_index = pd.DatetimeIndex(dates) - iterator2 = df.groupby(dt_index).__iter__() - assert_type(iterator2, Iterator[tuple[pd.Timestamp, pd.DataFrame]]) - index2, value2 = next(iterator2) - assert_type((index2, value2), tuple[pd.Timestamp, pd.DataFrame]) - - check(assert_type(index2, pd.Timestamp), pd.Timestamp) - check(assert_type(value2, pd.DataFrame), pd.DataFrame) - - tdelta_index = pd.TimedeltaIndex(dates - pd.Timestamp("2020-01-01")) - iterator3 = df.groupby(tdelta_index).__iter__() - assert_type(iterator3, Iterator[tuple[pd.Timedelta, pd.DataFrame]]) - index3, value3 = next(iterator3) - assert_type((index3, value3), tuple[pd.Timedelta, pd.DataFrame]) - - check(assert_type(index3, pd.Timedelta), pd.Timedelta) - check(assert_type(value3, pd.DataFrame), pd.DataFrame) - - intervals: list[pd.Interval[pd.Timestamp]] = [ - pd.Interval(date, date + pd.DateOffset(days=1), closed="left") for date in dates - ] - interval_index = 
pd.IntervalIndex(intervals) - assert_type(interval_index, "pd.IntervalIndex[pd.Interval[pd.Timestamp]]") - iterator4 = df.groupby(interval_index).__iter__() - assert_type(iterator4, Iterator[tuple["pd.Interval[pd.Timestamp]", pd.DataFrame]]) - index4, value4 = next(iterator4) - assert_type((index4, value4), tuple["pd.Interval[pd.Timestamp]", pd.DataFrame]) - - check(assert_type(index4, "pd.Interval[pd.Timestamp]"), pd.Interval) - check(assert_type(value4, pd.DataFrame), pd.DataFrame) - - for _p, _g in df.groupby(period_index): - pass - - for _dt, _g in df.groupby(dt_index): - pass - - for _tdelta, _g in df.groupby(tdelta_index): - pass - - for _interval, _g in df.groupby(interval_index): - pass - - -def test_groupby_result_for_ambiguous_indexes() -> None: - # GH 674 - df = pd.DataFrame({"a": [0, 1, 2], "b": [4, 5, 6], "c": [7, 8, 9]}) - # this will use pd.Index which is ambiguous - iterator = df.groupby(df.index).__iter__() - assert_type(iterator, Iterator[tuple[Any, pd.DataFrame]]) - index, value = next(iterator) - assert_type((index, value), tuple[Any, pd.DataFrame]) - - check(assert_type(index, Any), int) - check(assert_type(value, pd.DataFrame), pd.DataFrame) - - # categorical indexes are also ambiguous - - # https://github.com/pandas-dev/pandas/issues/54054 needs to be fixed - with pytest_warns_bounded( - FutureWarning, - "The default of observed=False is deprecated", - upper="2.3.99", - ): - categorical_index = pd.CategoricalIndex(df.a) - iterator2 = df.groupby(categorical_index).__iter__() - assert_type(iterator2, Iterator[tuple[Any, pd.DataFrame]]) - index2, value2 = next(iterator2) - assert_type((index2, value2), tuple[Any, pd.DataFrame]) - - check(assert_type(index2, Any), int) - check(assert_type(value2, pd.DataFrame), pd.DataFrame) - - -def test_setitem_list() -> None: - # GH 153 - lst1: list[str] = ["a", "b", "c"] - lst2: list[int] = [1, 2, 3] - lst3: list[float] = [4.0, 5.0, 6.0] - lst4: list[tuple[str, int]] = [("a", 1), ("b", 2), ("c", 3)] - lst5: 
list[complex] = [0 + 1j, 0 + 2j, 0 + 3j] - - columns: list[Hashable] = [ - "a", - "b", - "c", - 1, - 2, - 3, - 4.0, - 5.0, - 6.0, - ("a", 1), - ("b", 2), - ("c", 3), - 0 + 1j, - 0 + 2j, - 0 + 3j, - ] - - df = pd.DataFrame(np.empty((3, 15)), columns=columns) - - check(assert_type(df.set_index(lst1), pd.DataFrame), pd.DataFrame) - check(assert_type(df.set_index(lst2), pd.DataFrame), pd.DataFrame) - check(assert_type(df.set_index(lst3), pd.DataFrame), pd.DataFrame) - check(assert_type(df.set_index(lst4), pd.DataFrame), pd.DataFrame) - check(assert_type(df.set_index(lst5), pd.DataFrame), pd.DataFrame) - - iter1: Iterator[str] = (v for v in lst1) - iter2: Iterator[tuple[str, int]] = (v for v in lst4) - check(assert_type(df.set_index(iter1), pd.DataFrame), pd.DataFrame) - check(assert_type(df.set_index(iter2), pd.DataFrame), pd.DataFrame) - - -def test_groupby_apply() -> None: - # GH 167 - df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) - - def sum_mean(x: pd.DataFrame) -> float: - return x.sum().mean() - - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply operated on the grouping columns.", - lower="2.2.99", - upper="2.99", - ): - check( - assert_type(df.groupby("col1").apply(sum_mean), pd.Series), - pd.Series, - ) - - lfunc: Callable[[pd.DataFrame], float] = lambda x: x.sum().mean() - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply operated on the grouping columns.", - lower="2.2.99", - upper="2.99", - ): - check(assert_type(df.groupby("col1").apply(lfunc), pd.Series), pd.Series) - - def sum_to_list(x: pd.DataFrame) -> list: - return x.sum().tolist() - - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply operated on the grouping columns.", - lower="2.2.99", - upper="2.99", - ): - check(assert_type(df.groupby("col1").apply(sum_to_list), pd.Series), pd.Series) - - def sum_to_series(x: pd.DataFrame) -> pd.Series: - return x.sum() - - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply 
operated on the grouping columns.", - lower="2.2.99", - upper="2.99", - ): - check( - assert_type(df.groupby("col1").apply(sum_to_series), pd.DataFrame), - pd.DataFrame, - ) - - def sample_to_df(x: pd.DataFrame) -> pd.DataFrame: - return x.sample() - - with pytest_warns_bounded( - FutureWarning, - "DataFrameGroupBy.apply operated on the grouping columns.", - lower="2.2.99", - upper="2.99", - ): - check( - assert_type( - df.groupby("col1", group_keys=False).apply(sample_to_df), pd.DataFrame - ), - pd.DataFrame, - ) - - def test_resample() -> None: # GH 181 N = 10 @@ -3872,45 +2974,6 @@ def test_squeeze() -> None: check(assert_type(df4.squeeze(), pd.DataFrame | pd.Series | Scalar), np.integer) -def test_loc_set() -> None: - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - df.loc["a"] = [3, 4] - - -def test_loc_int_set() -> None: - df = pd.DataFrame({1: [1, 2], 2: [3, 4]}) - df.loc[1] = [3, 4] - df.loc[np.int_(1)] = pd.Series([1, 2]) - df.loc[np.uint(1)] = pd.Series([1, 2]) - df.loc[np.int8(1)] = pd.Series([1, 2]) - df.loc[np.int32(1)] = [2, 3] - df.loc[np.uint64(1)] = [2, 3] - - -@pytest.mark.parametrize("col", [1, None]) -@pytest.mark.parametrize("typ", [list, tuple, deque, UserList, iter]) -def test_loc_iterable(col: Hashable, typ: type) -> None: - # GH 189, GH 1410 - df = pd.DataFrame({1: [1, 2], None: 5}, columns=pd.Index([1, None], dtype=object)) - check(df.loc[:, typ([col])], pd.DataFrame) - - if TYPE_CHECKING: - assert_type(df.loc[:, [None]], pd.DataFrame) - assert_type(df.loc[:, [1]], pd.DataFrame) - - assert_type(df.loc[:, (None,)], pd.DataFrame) - assert_type(df.loc[:, (1,)], pd.DataFrame) - - assert_type(df.loc[:, deque([None])], pd.DataFrame) - assert_type(df.loc[:, deque([1])], pd.DataFrame) - - assert_type(df.loc[:, UserList([None])], pd.DataFrame) - assert_type(df.loc[:, UserList([1])], pd.DataFrame) - - assert_type(df.loc[:, (None for _ in [0])], pd.DataFrame) - assert_type(df.loc[:, (1 for _ in [0])], pd.DataFrame) - - def test_dict_items() -> None: # 
GH 180 x = {"a": [1]} @@ -3930,23 +2993,25 @@ def func() -> MyDataFrame[int]: def test_to_xarray() -> None: - check(assert_type(DF.to_xarray(), xr.Dataset), xr.Dataset) + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check(assert_type(df.to_xarray(), xr.Dataset), xr.Dataset) def test_to_records() -> None: - check(assert_type(DF.to_records(False, "int8"), np.recarray), np.recarray) + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check(assert_type(df.to_records(False, "int8"), np.recarray), np.recarray) check( - assert_type(DF.to_records(False, index_dtypes=np.int8), np.recarray), + assert_type(df.to_records(False, index_dtypes=np.int8), np.recarray), np.recarray, ) check( assert_type( - DF.to_records(False, {"col1": np.int8, "col2": np.int16}), np.recarray + df.to_records(False, {"col1": np.int8, "col2": np.int16}), np.recarray ), np.recarray, ) dtypes = {"col1": np.int8, "col2": np.int16} - check(assert_type(DF.to_records(False, dtypes), np.recarray), np.recarray) + check(assert_type(df.to_records(False, dtypes), np.recarray), np.recarray) def test_to_dict_simple() -> None: @@ -4074,28 +3139,6 @@ def test_xs_key() -> None: check(assert_type(df.xs(0, level="foo"), pd.DataFrame | pd.Series), pd.DataFrame) -def test_loc_slice() -> None: - """Test DataFrame.loc with a slice, Index, Series.""" - # GH277 - df1 = pd.DataFrame( - {"x": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["num", "let"]), - ) - check(assert_type(df1.loc[1, :], pd.Series | pd.DataFrame), pd.DataFrame) - check(assert_type(df1[::-1], pd.DataFrame), pd.DataFrame) - - # GH1299 - ind = pd.Index(["a", "b"]) - mask = pd.Series([True, False]) - mask_col = pd.Series([True, False], index=pd.Index(["a", "b"])) - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - - # loc with index for columns - check(assert_type(df.loc[mask, ind], pd.DataFrame), pd.DataFrame) - # loc with index for columns - check(assert_type(df.loc[mask, mask_col], pd.DataFrame), 
pd.DataFrame) - - def where_cond1(x: int) -> bool: return x % 2 == 0 @@ -4144,28 +3187,12 @@ def cond1(x: int) -> bool: check(assert_type(df.mask(cond1, inplace=True), None), pd.DataFrame) -def test_setitem_loc() -> None: - # GH 254 - df = pd.DataFrame.from_dict( - dict.fromkeys(["A", "B", "C"], (True, True, True)), orient="index" - ) - df.loc[["A", "C"]] = False - my_arr = ["A", "C"] - df.loc[my_arr] = False - - def test_replace_na() -> None: # GH 262 frame = pd.DataFrame(["N/A", "foo", "bar"]) check(assert_type(frame.replace("N/A", pd.NA), pd.DataFrame), pd.DataFrame) -def test_isetframe() -> None: - frame = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - check(assert_type(frame.isetitem(0, 10), None), type(None)) - check(assert_type(frame.isetitem([0], [10, 12]), None), type(None)) - - def test_reset_index_150_changes() -> None: frame = pd.DataFrame({"a": [1, 2, 3, 4]}, index=[-10, -9, -8, -7]) check( @@ -4247,19 +3274,6 @@ def test_series_added_in_astype() -> None: check(assert_type(df.astype(df.dtypes), pd.DataFrame), pd.DataFrame) -def test_series_groupby_and_value_counts() -> None: - df = pd.DataFrame( - { - "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], - "Max Speed": [380, 370, 24, 26], - } - ) - c1 = df.groupby("Animal")["Max Speed"].value_counts() - c2 = df.groupby("Animal")["Max Speed"].value_counts(normalize=True) - check(assert_type(c1, "pd.Series[int]"), pd.Series, np.integer) - check(assert_type(c2, "pd.Series[float]"), pd.Series, float) - - def test_axes_as_tuple() -> None: # GH 384 index = (3, 5, 7) @@ -4277,72 +3291,6 @@ def test_astype_dict() -> None: check(assert_type(df.astype({"a": "int", 43: "float"}), pd.DataFrame), pd.DataFrame) -def test_setitem_none() -> None: - df = pd.DataFrame( - {"A": [1, 2, 3], "B": ["abc", "def", "ghi"]}, index=["x", "y", "z"] - ) - df.loc["x", "B"] = None - df.iloc[2, 0] = None - sb = pd.Series([1, 2, 3], dtype=int) - sb.loc["y"] = None - sb.iloc[0] = None - - -def test_groupby_and_transform() -> None: - 
df = pd.DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar"], - "B": ["one", "one", "two", "three", "two", "two"], - "C": [1, 5, 5, 2, 5, 5], - "D": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], - } - ) - ser = pd.Series( - [390.0, 350.0, 30.0, 20.0], - index=["Falcon", "Falcon", "Parrot", "Parrot"], - name="Max Speed", - ) - grouped = df.groupby("A")[["C", "D"]] - grouped1 = ser.groupby(ser > 100) - c1 = grouped.transform("sum") - c2 = grouped.transform(lambda x: (x - x.mean()) / x.std()) - c3 = grouped1.transform("cumsum") - c4 = grouped1.transform(lambda x: x.max() - x.min()) - check(assert_type(c1, pd.DataFrame), pd.DataFrame) - check(assert_type(c2, pd.DataFrame), pd.DataFrame) - check(assert_type(c3, pd.Series), pd.Series) - check(assert_type(c4, pd.Series), pd.Series) - - -def test_getattr_and_dataframe_groupby() -> None: - df = pd.DataFrame( - data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0], 0: [-1, -1, -1]} - ) - with pytest_warns_bounded( - FutureWarning, - r"The provided callable is currently using", - upper="2.3.99", - ): - check(assert_type(df.groupby("col1").col3.agg(min), pd.Series), pd.Series) - check( - assert_type(df.groupby("col1").col3.agg([min, max]), pd.DataFrame), - pd.DataFrame, - ) - - -def test_getsetitem_multiindex() -> None: - # GH 466 - rows = pd.Index(["project A", "project B", "project C"]) - years: tuple[str, ...] = ("Year 1", "Year 2", "Year 3") - quarters: tuple[str, ...] 
= ("Q1", "Q2", "Q3", "Q4") - index_tuples: list[tuple[str, ...]] = list(itertools.product(years, quarters)) - cols = pd.MultiIndex.from_tuples(index_tuples) - budget = pd.DataFrame(index=rows, columns=cols) - multi_index: tuple[str, str] = ("Year 1", "Q1") - budget.loc["project A", multi_index] = 4700 - check(assert_type(budget.loc["project A", multi_index], Scalar), int) - - def test_frame_dropna_subset() -> None: # GH 434 data = {"col1": [1, 3, 4], "col2": [2, 3, 5], "col3": [2, 4, 4]} @@ -4353,44 +3301,6 @@ def test_frame_dropna_subset() -> None: ) -def test_loc_callable() -> None: - # GH 256 - df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - def select1(df: pd.DataFrame) -> pd.Series: - return df["x"] > 2.0 - - check(assert_type(df.loc[select1], pd.DataFrame), pd.DataFrame) - check(assert_type(df.loc[select1, :], pd.DataFrame), pd.DataFrame) - - def select2(df: pd.DataFrame) -> list[Hashable]: - return [i for i in df.index if cast(int, i) % 2 == 1] - - check(assert_type(df.loc[select2, "x"], pd.Series), pd.Series) - - def select3(_: pd.DataFrame) -> int: - return 1 - - check(assert_type(df.loc[select3, "x"], Scalar), np.integer) - - check( - assert_type(df.loc[:, lambda df: df.columns.str.startswith("x")], pd.DataFrame), - pd.DataFrame, - ) - - -def test_npint_loc_indexer() -> None: - # GH 508 - - df = pd.DataFrame({"x": [1, 2, 3]}, index=np.array([10, 20, 30], dtype="uint64")) - - def get_NDArray(df: pd.DataFrame, key: npt.NDArray[np.uint64]) -> pd.DataFrame: - return df.loc[key] - - a: npt.NDArray[np.uint64] = np.array([10, 30], dtype="uint64") - check(assert_type(get_NDArray(df, a), pd.DataFrame), pd.DataFrame) - - def test_in_columns() -> None: # GH 532 (PR) df = pd.DataFrame(np.random.random((3, 4)), columns=["cat", "dog", "rat", "pig"]) @@ -4401,18 +3311,6 @@ def test_in_columns() -> None: check(assert_type(df.groupby(by=cols).sum(), pd.DataFrame), pd.DataFrame) -def test_loc_list_str() -> None: - # GH 1162 (PR) - df = pd.DataFrame( - [[1, 2], [4, 
5], [7, 8]], - index=["cobra", "viper", "sidewinder"], - columns=["max_speed", "shield"], - ) - - result = df.loc[["viper", "sidewinder"]] - check(assert_type(result, pd.DataFrame), pd.DataFrame) - - def test_insert_newvalues() -> None: df = pd.DataFrame({"a": [1, 2]}) ab = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) @@ -4530,12 +3428,6 @@ def test_align() -> None: check(assert_type(aligned_df1, pd.DataFrame), pd.DataFrame) -def test_loc_returns_series() -> None: - df1 = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40]) - df2 = df1.loc[10, :] - check(assert_type(df2, pd.Series | pd.DataFrame), pd.Series) - - def test_to_dict_index() -> None: df = pd.DataFrame({"a": [1, 2], "b": [9, 10]}) check( @@ -4690,39 +3582,6 @@ def test_interpolate() -> None: ) -def test_getitem_generator() -> None: - # GH 685 - check( - assert_type(DF[(f"col{i + 1}" for i in range(2))], pd.DataFrame), pd.DataFrame - ) - - -def test_getitem_dict_keys() -> None: - # GH 770 - some_columns = {"a": [1], "b": [2]} - df = pd.DataFrame.from_dict(some_columns) - check(assert_type(df[some_columns.keys()], pd.DataFrame), pd.DataFrame) - - -def test_frame_setitem_na() -> None: - # GH 743 - df = pd.DataFrame( - {"x": [1, 2, 3], "y": pd.date_range("3/1/2023", "3/3/2023")}, - index=pd.Index(["a", "b", "c"]), - ).convert_dtypes() - - ind = pd.Index(["a", "c"]) - - df.loc[ind, :] = pd.NA - df.iloc[[0, 2], :] = pd.NA - - # reveal_type(df["y"]) gives Series[Any], so we have to cast to tell the - # type checker what kind of type it is when adding to a Timedelta - df["x"] = cast("pd.Series[pd.Timestamp]", df["y"]) + pd.Timedelta(days=3) - df.loc[ind, :] = pd.NaT - df.iloc[[0, 2], :] = pd.NaT - - def test_itertuples() -> None: # GH 822 df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) @@ -4785,28 +3644,6 @@ def test_info() -> None: check(assert_type(df.info(show_counts=None), None), type(None)) -def test_frame_single_slice() -> None: - # GH 572 - df = pd.DataFrame([1, 2, 3]) - 
check(assert_type(df.loc[:], pd.DataFrame), pd.DataFrame) - - df.loc[:] = 1 + df - - -def test_frame_index_timestamp() -> None: - # GH 620 - dt1 = pd.to_datetime("2023-05-01") - dt2 = pd.to_datetime("2023-05-02") - s = pd.Series([1, 2], index=[dt1, dt2]) - df = pd.DataFrame(s) - # Next result is Series or DataFrame because the index could be a MultiIndex - check(assert_type(df.loc[dt1, :], pd.Series | pd.DataFrame), pd.Series) - check(assert_type(df.loc[[dt1], :], pd.DataFrame), pd.DataFrame) - df2 = pd.DataFrame({"x": s}) - check(assert_type(df2.loc[dt1, "x"], Scalar), np.integer) - check(assert_type(df2.loc[[dt1], "x"], pd.Series), pd.Series, np.integer) - - def test_frame_bool_fails() -> None: # GH 663 @@ -4891,16 +3728,6 @@ def test_combine() -> None: ) -def test_df_loc_dict() -> None: - """Test that we can set a dict to a df.loc result GH1203.""" - df = pd.DataFrame(columns=["X"]) - df.loc[0] = {"X": 0} - check(assert_type(df, pd.DataFrame), pd.DataFrame) - - df.iloc[0] = {"X": 0} - check(assert_type(df, pd.DataFrame), pd.DataFrame) - - def test_unstack() -> None: """Test different types of argument for `fill_value` in DataFrame.unstack.""" df = pd.DataFrame( diff --git a/tests/frame/test_groupby.py b/tests/frame/test_groupby.py new file mode 100644 index 000000000..20c2f013f --- /dev/null +++ b/tests/frame/test_groupby.py @@ -0,0 +1,628 @@ +from __future__ import annotations + +from collections.abc import ( + Callable, + Hashable, + Iterator, +) +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np +import pandas as pd +from typing_extensions import assert_type + +from pandas._typing import Scalar + +from tests import ( + PD_LTE_23, + check, + pytest_warns_bounded, +) + +if TYPE_CHECKING: + from pandas._typing import S1 + + +def test_types_groupby_as_index() -> None: + """Test type of groupby.size method depending on `as_index`.""" + df = pd.DataFrame({"a": [1, 2, 3]}) + check( + assert_type( + df.groupby("a", as_index=False).size(), + 
pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + df.groupby("a", as_index=True).size(), + "pd.Series[int]", + ), + pd.Series, + ) + check( + assert_type( + df.groupby("a").size(), + "pd.Series[int]", + ), + pd.Series, + ) + + +def test_types_groupby_as_index_list() -> None: + """Test type of groupby.size method depending on list of grouper GH1045.""" + df = pd.DataFrame({"a": [1, 1, 2], "b": [2, 3, 2]}) + check( + assert_type( + df.groupby(["a", "b"], as_index=False).size(), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + df.groupby(["a", "b"], as_index=True).size(), + "pd.Series[int]", + ), + pd.Series, + ) + check( + assert_type( + df.groupby(["a", "b"]).size(), + "pd.Series[int]", + ), + pd.Series, + ) + + +def test_types_groupby_as_index_value_counts() -> None: + """Test type of groupby.value_counts method depending on `as_index`.""" + df = pd.DataFrame({"a": [1, 2, 3]}) + check( + assert_type( + df.groupby("a", as_index=False).value_counts(), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + df.groupby("a", as_index=True).value_counts(), + "pd.Series[int]", + ), + pd.Series, + ) + + +def test_types_groupby_size() -> None: + """Test for GH886.""" + data = [ + {"date": "2023-12-01", "val": 12}, + {"date": "2023-12-02", "val": 2}, + {"date": "2023-12-03", "val": 1}, + {"date": "2023-12-03", "val": 10}, + ] + + df = pd.DataFrame(data) + groupby = df.groupby("date") + size = groupby.size() + frame = size.to_frame() + check(assert_type(frame.reset_index(), pd.DataFrame), pd.DataFrame) + + +def test_types_groupby() -> None: + df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]}) + df.index.name = "ind" + df.groupby(by="col1") + df.groupby(level="ind") + df.groupby(by="col1", sort=False, as_index=True) + df.groupby(by=["col1", "col2"]) + # GH 284 + df.groupby(df["col1"] > 2) + df.groupby([df["col1"] > 2, df["col2"] % 2 == 1]) + df.groupby(lambda x: x) + df.groupby([lambda x: x % 2, lambda 
x: x % 3]) + df.groupby(np.array([1, 0, 1])) + df.groupby([np.array([1, 0, 0]), np.array([0, 0, 1])]) + df.groupby({1: 1, 2: 2, 3: 3}) + df.groupby([{1: 1, 2: 1, 3: 2}, {1: 1, 2: 2, 3: 2}]) + df.groupby(df.index) + df.groupby([pd.Index([1, 0, 0]), pd.Index([0, 0, 1])]) + df.groupby(pd.Grouper(level=0)) + df.groupby([pd.Grouper(level=0), pd.Grouper(key="col1")]) + + check(assert_type(df.groupby(by="col1").agg("sum"), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.groupby(level="ind").aggregate("sum"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type( + df.groupby(by="col1", sort=False, as_index=True).transform( + lambda x: x.max() + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type(df.groupby(by=["col1", "col2"]).count(), pd.DataFrame), pd.DataFrame + ) + check( + assert_type( + df.groupby(by=["col1", "col2"]).filter(lambda x: x["col1"] > 0), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type(df.groupby(by=["col1", "col2"]).nunique(), pd.DataFrame), + pd.DataFrame, + ) + with pytest_warns_bounded( + FutureWarning, + "(The provided callable is currently using|The behavior of DataFrame.sum with)", + upper="2.3.99", + ): + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns", + upper="2.3.99", + ): + if PD_LTE_23: + check( + assert_type(df.groupby(by="col1").apply(sum), pd.DataFrame), + pd.DataFrame, + ) + check(assert_type(df.groupby("col1").transform("sum"), pd.DataFrame), pd.DataFrame) + s1 = df.set_index("col1")["col2"] + check(assert_type(s1, pd.Series), pd.Series) + check(assert_type(s1.groupby("col1").transform("sum"), pd.Series), pd.Series) + + +def test_types_groupby_methods() -> None: + df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]}) + check(assert_type(df.groupby("col1").sum(), pd.DataFrame), pd.DataFrame) + check(assert_type(df.groupby("col1").prod(), pd.DataFrame), pd.DataFrame) + check(assert_type(df.groupby("col1").sample(), 
pd.DataFrame), pd.DataFrame) + check(assert_type(df.groupby("col1").count(), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.groupby("col1").value_counts(normalize=False), "pd.Series[int]"), + pd.Series, + np.integer, + ) + check( + assert_type( + df.groupby("col1").value_counts(subset=None, normalize=True), + "pd.Series[float]", + ), + pd.Series, + float, + ) + check(assert_type(df.groupby("col1").idxmax(), pd.DataFrame), pd.DataFrame) + check(assert_type(df.groupby("col1").idxmin(), pd.DataFrame), pd.DataFrame) + + +def test_types_groupby_agg() -> None: + df = pd.DataFrame( + data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0], 0: [-1, -1, -1]} + ) + check(assert_type(df.groupby("col1").agg("min"), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.groupby("col1").agg(["min", "max"]), pd.DataFrame), pd.DataFrame + ) + agg_dict1 = {"col2": "min", "col3": "max", 0: "sum"} + check(assert_type(df.groupby("col1").agg(agg_dict1), pd.DataFrame), pd.DataFrame) + + def wrapped_min(x: pd.Series[S1]) -> S1: + return x.min() + + with pytest_warns_bounded( + FutureWarning, + r"The provided callable is currently using", + upper="2.3.99", + ): + check(assert_type(df.groupby("col1")["col3"].agg(min), pd.Series), pd.Series) + check( + assert_type(df.groupby("col1")["col3"].agg([min, max]), pd.DataFrame), + pd.DataFrame, + ) + check(assert_type(df.groupby("col1").agg(min), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.groupby("col1").agg([min, max]), pd.DataFrame), pd.DataFrame + ) + agg_dict2 = {"col2": min, "col3": max, 0: min} + check( + assert_type(df.groupby("col1").agg(agg_dict2), pd.DataFrame), pd.DataFrame + ) + + # Here, MyPy infers dict[object, object], so it must be explicitly annotated + agg_dict3: dict[str | int, str | Callable[..., Any]] = { + "col2": min, + "col3": "max", + 0: wrapped_min, + } + check( + assert_type(df.groupby("col1").agg(agg_dict3), pd.DataFrame), pd.DataFrame + ) + agg_dict4 = {"col2": "sum"} + 
check(assert_type(df.groupby("col1").agg(agg_dict4), pd.DataFrame), pd.DataFrame) + agg_dict5 = {0: "sum"} + check(assert_type(df.groupby("col1").agg(agg_dict5), pd.DataFrame), pd.DataFrame) + named_agg = pd.NamedAgg(column="col2", aggfunc="max") + check( + assert_type(df.groupby("col1").agg(new_col=named_agg), pd.DataFrame), + pd.DataFrame, + ) + # GH#187 + cols: list[str] = ["col1", "col2"] + check(assert_type(df.groupby(by=cols).sum(), pd.DataFrame), pd.DataFrame) + + cols_opt: list[str | None] = ["col1", "col2"] + check(assert_type(df.groupby(by=cols_opt).sum(), pd.DataFrame), pd.DataFrame) + + cols_mixed: list[str | int] = ["col1", 0] + check(assert_type(df.groupby(by=cols_mixed).sum(), pd.DataFrame), pd.DataFrame) + # GH 736 + check(assert_type(df.groupby(by="col1").aggregate("size"), pd.Series), pd.Series) + check(assert_type(df.groupby(by="col1").agg("size"), pd.Series), pd.Series) + + +# This was added in 1.1.0 https://pandas.pydata.org/docs/whatsnew/v1.1.0.html +def test_types_group_by_with_dropna_keyword() -> None: + df = pd.DataFrame( + data={"col1": [1, 1, 2, 1], "col2": [2, None, 1, 2], "col3": [3, 4, 3, 2]} + ) + check( + assert_type(df.groupby(by="col2", dropna=True).sum(), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(df.groupby(by="col2", dropna=False).sum(), pd.DataFrame), + pd.DataFrame, + ) + check(assert_type(df.groupby(by="col2").sum(), pd.DataFrame), pd.DataFrame) + + +def test_types_groupby_any() -> None: + df = pd.DataFrame( + data={ + "col1": [1, 1, 2], + "col2": [True, False, False], + "col3": [False, False, False], + } + ) + check(assert_type(df.groupby("col1").any(), pd.DataFrame), pd.DataFrame) + check(assert_type(df.groupby("col1").all(), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.groupby("col1")["col2"].any(), "pd.Series[bool]"), + pd.Series, + np.bool_, + ) + check( + assert_type(df.groupby("col1")["col2"].any(), "pd.Series[bool]"), + pd.Series, + np.bool_, + ) + + +def test_types_groupby_iter() -> None: + 
df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5]}) + series_groupby = pd.Series([True, True, False], dtype=bool) + first_group = next(iter(df.groupby(series_groupby))) + check( + assert_type(first_group[0], bool), + bool, + ) + check( + assert_type(first_group[1], pd.DataFrame), + pd.DataFrame, + ) + + +def test_types_groupby_level() -> None: + # GH 836 + data = { + "col1": [0, 0, 0], + "col2": [0, 1, 0], + "col3": [1, 2, 3], + "col4": [1, 2, 3], + } + df = pd.DataFrame(data=data).set_index(["col1", "col2", "col3"]) + check( + assert_type(df.groupby(level=["col1", "col2"]).sum(), pd.DataFrame), + pd.DataFrame, + ) + + +def test_groupby_series_methods() -> None: + df = pd.DataFrame({"x": [1, 2, 2, 3, 3], "y": [10, 20, 30, 40, 50]}) + gb = df.groupby("x")["y"] + check(assert_type(gb.describe(), pd.DataFrame), pd.DataFrame) + check(assert_type(gb.count().loc[2], int), np.integer) + check(assert_type(gb.pct_change(), pd.Series), pd.Series) + check(assert_type(gb.bfill(), pd.Series), pd.Series) + check(assert_type(gb.cummax(), pd.Series), pd.Series) + check(assert_type(gb.cummin(), pd.Series), pd.Series) + check(assert_type(gb.cumprod(), pd.Series), pd.Series) + check(assert_type(gb.cumsum(), pd.Series), pd.Series) + check(assert_type(gb.ffill(), pd.Series), pd.Series) + check(assert_type(gb.first(), pd.Series), pd.Series) + check(assert_type(gb.head(), pd.Series), pd.Series) + check(assert_type(gb.last(), pd.Series), pd.Series) + check(assert_type(gb.max(), pd.Series), pd.Series) + check(assert_type(gb.mean(), pd.Series), pd.Series) + check(assert_type(gb.median(), pd.Series), pd.Series) + check(assert_type(gb.min(), pd.Series), pd.Series) + check(assert_type(gb.nlargest(), pd.Series), pd.Series) + check(assert_type(gb.nsmallest(), pd.Series), pd.Series) + check(assert_type(gb.nth(0), pd.DataFrame | pd.Series), pd.Series) + check(assert_type(gb.nth[0, 1, 2], pd.DataFrame | pd.Series), pd.Series) + check(assert_type(gb.nth((0, 1, 2)), pd.DataFrame | 
pd.Series), pd.Series) + + +def test_groupby_index() -> None: + # GH 42 + df = pd.DataFrame( + data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0]} + ).set_index("col1") + check(assert_type(df.groupby(df.index).min(), pd.DataFrame), pd.DataFrame) + + +def test_groupby_result() -> None: + # GH 142 + df = pd.DataFrame({"a": [0, 1, 2], "b": [4, 5, 6], "c": [7, 8, 9]}) + iterator = df.groupby(["a", "b"]).__iter__() + assert_type(iterator, Iterator[tuple[tuple[Hashable, ...], pd.DataFrame]]) + index, value = next(iterator) + assert_type((index, value), tuple[tuple[Hashable, ...], pd.DataFrame]) + + if PD_LTE_23: + check(assert_type(index, tuple[Hashable, ...]), tuple, np.integer) + else: + check(assert_type(index, tuple[Hashable, ...]), tuple, int) + + check(assert_type(value, pd.DataFrame), pd.DataFrame) + + iterator2 = df.groupby("a").__iter__() + assert_type(iterator2, Iterator[tuple[Scalar, pd.DataFrame]]) + index2, value2 = next(iterator2) + assert_type((index2, value2), tuple[Scalar, pd.DataFrame]) + + check(assert_type(index2, Scalar), int) + check(assert_type(value2, pd.DataFrame), pd.DataFrame) + + # GH 674 + # grouping by pd.MultiIndex should always resolve to a tuple as well + multi_index = pd.MultiIndex.from_frame(df[["a", "b"]]) + iterator3 = df.groupby(multi_index).__iter__() + assert_type(iterator3, Iterator[tuple[tuple[Hashable, ...], pd.DataFrame]]) + index3, value3 = next(iterator3) + assert_type((index3, value3), tuple[tuple[Hashable, ...], pd.DataFrame]) + + check(assert_type(index3, tuple[Hashable, ...]), tuple, int) + check(assert_type(value3, pd.DataFrame), pd.DataFrame) + + # Want to make sure these cases are differentiated + for (_k1, _k2), _g in df.groupby(["a", "b"]): + pass + + for _kk, _g in df.groupby("a"): + pass + + for (_k1, _k2), _g in df.groupby(multi_index): + pass + + +def test_groupby_result_for_scalar_indexes() -> None: + # GH 674 + dates = pd.date_range("2020-01-01", "2020-12-31") + df = pd.DataFrame({"date": dates, 
"days": 1}) + period_index = pd.PeriodIndex(df.date, freq="M") + iterator = df.groupby(period_index).__iter__() + assert_type(iterator, Iterator[tuple[pd.Period, pd.DataFrame]]) + index, value = next(iterator) + assert_type((index, value), tuple[pd.Period, pd.DataFrame]) + + check(assert_type(index, pd.Period), pd.Period) + check(assert_type(value, pd.DataFrame), pd.DataFrame) + + dt_index = pd.DatetimeIndex(dates) + iterator2 = df.groupby(dt_index).__iter__() + assert_type(iterator2, Iterator[tuple[pd.Timestamp, pd.DataFrame]]) + index2, value2 = next(iterator2) + assert_type((index2, value2), tuple[pd.Timestamp, pd.DataFrame]) + + check(assert_type(index2, pd.Timestamp), pd.Timestamp) + check(assert_type(value2, pd.DataFrame), pd.DataFrame) + + tdelta_index = pd.TimedeltaIndex(dates - pd.Timestamp("2020-01-01")) + iterator3 = df.groupby(tdelta_index).__iter__() + assert_type(iterator3, Iterator[tuple[pd.Timedelta, pd.DataFrame]]) + index3, value3 = next(iterator3) + assert_type((index3, value3), tuple[pd.Timedelta, pd.DataFrame]) + + check(assert_type(index3, pd.Timedelta), pd.Timedelta) + check(assert_type(value3, pd.DataFrame), pd.DataFrame) + + intervals: list[pd.Interval[pd.Timestamp]] = [ + pd.Interval(date, date + pd.DateOffset(days=1), closed="left") for date in dates + ] + interval_index = pd.IntervalIndex(intervals) + assert_type(interval_index, "pd.IntervalIndex[pd.Interval[pd.Timestamp]]") + iterator4 = df.groupby(interval_index).__iter__() + assert_type(iterator4, Iterator[tuple["pd.Interval[pd.Timestamp]", pd.DataFrame]]) + index4, value4 = next(iterator4) + assert_type((index4, value4), tuple["pd.Interval[pd.Timestamp]", pd.DataFrame]) + + check(assert_type(index4, "pd.Interval[pd.Timestamp]"), pd.Interval) + check(assert_type(value4, pd.DataFrame), pd.DataFrame) + + for _p, _g in df.groupby(period_index): + pass + + for _dt, _g in df.groupby(dt_index): + pass + + for _tdelta, _g in df.groupby(tdelta_index): + pass + + for _interval, _g in 
df.groupby(interval_index): + pass + + +def test_groupby_result_for_ambiguous_indexes() -> None: + # GH 674 + df = pd.DataFrame({"a": [0, 1, 2], "b": [4, 5, 6], "c": [7, 8, 9]}) + # this will use pd.Index which is ambiguous + iterator = df.groupby(df.index).__iter__() + assert_type(iterator, Iterator[tuple[Any, pd.DataFrame]]) + index, value = next(iterator) + assert_type((index, value), tuple[Any, pd.DataFrame]) + + check(assert_type(index, Any), int) + check(assert_type(value, pd.DataFrame), pd.DataFrame) + + # categorical indexes are also ambiguous + + # https://github.com/pandas-dev/pandas/issues/54054 needs to be fixed + with pytest_warns_bounded( + FutureWarning, + "The default of observed=False is deprecated", + upper="2.3.99", + ): + categorical_index = pd.CategoricalIndex(df.a) + iterator2 = df.groupby(categorical_index).__iter__() + assert_type(iterator2, Iterator[tuple[Any, pd.DataFrame]]) + index2, value2 = next(iterator2) + assert_type((index2, value2), tuple[Any, pd.DataFrame]) + + check(assert_type(index2, Any), int) + check(assert_type(value2, pd.DataFrame), pd.DataFrame) + + +def test_groupby_apply() -> None: + # GH 167 + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}) + + def sum_mean(x: pd.DataFrame) -> float: + return x.sum().mean() + + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns.", + lower="2.2.99", + upper="2.99", + ): + check( + assert_type(df.groupby("col1").apply(sum_mean), pd.Series), + pd.Series, + ) + + lfunc: Callable[[pd.DataFrame], float] = lambda x: x.sum().mean() + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns.", + lower="2.2.99", + upper="2.99", + ): + check(assert_type(df.groupby("col1").apply(lfunc), pd.Series), pd.Series) + + def sum_to_list(x: pd.DataFrame) -> list: + return x.sum().tolist() + + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns.", + 
lower="2.2.99", + upper="2.99", + ): + check(assert_type(df.groupby("col1").apply(sum_to_list), pd.Series), pd.Series) + + def sum_to_series(x: pd.DataFrame) -> pd.Series: + return x.sum() + + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns.", + lower="2.2.99", + upper="2.99", + ): + check( + assert_type(df.groupby("col1").apply(sum_to_series), pd.DataFrame), + pd.DataFrame, + ) + + def sample_to_df(x: pd.DataFrame) -> pd.DataFrame: + return x.sample() + + with pytest_warns_bounded( + FutureWarning, + "DataFrameGroupBy.apply operated on the grouping columns.", + lower="2.2.99", + upper="2.99", + ): + check( + assert_type( + df.groupby("col1", group_keys=False).apply(sample_to_df), pd.DataFrame + ), + pd.DataFrame, + ) + + +def test_series_groupby_and_value_counts() -> None: + df = pd.DataFrame( + { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + "Max Speed": [380, 370, 24, 26], + } + ) + c1 = df.groupby("Animal")["Max Speed"].value_counts() + c2 = df.groupby("Animal")["Max Speed"].value_counts(normalize=True) + check(assert_type(c1, "pd.Series[int]"), pd.Series, np.integer) + check(assert_type(c2, "pd.Series[float]"), pd.Series, float) + + +def test_groupby_and_transform() -> None: + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar"], + "B": ["one", "one", "two", "three", "two", "two"], + "C": [1, 5, 5, 2, 5, 5], + "D": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], + } + ) + ser = pd.Series( + [390.0, 350.0, 30.0, 20.0], + index=["Falcon", "Falcon", "Parrot", "Parrot"], + name="Max Speed", + ) + grouped = df.groupby("A")[["C", "D"]] + grouped1 = ser.groupby(ser > 100) + c1 = grouped.transform("sum") + c2 = grouped.transform(lambda x: (x - x.mean()) / x.std()) + c3 = grouped1.transform("cumsum") + c4 = grouped1.transform(lambda x: x.max() - x.min()) + check(assert_type(c1, pd.DataFrame), pd.DataFrame) + check(assert_type(c2, pd.DataFrame), pd.DataFrame) + check(assert_type(c3, pd.Series), 
pd.Series) + check(assert_type(c4, pd.Series), pd.Series) + + +def test_getattr_and_dataframe_groupby() -> None: + df = pd.DataFrame( + data={"col1": [1, 1, 2], "col2": [3, 4, 5], "col3": [0, 1, 0], 0: [-1, -1, -1]} + ) + with pytest_warns_bounded( + FutureWarning, + r"The provided callable is currently using", + upper="2.3.99", + ): + check(assert_type(df.groupby("col1").col3.agg(min), pd.Series), pd.Series) + check( + assert_type(df.groupby("col1").col3.agg([min, max]), pd.DataFrame), + pd.DataFrame, + ) diff --git a/tests/frame/test_indexing.py b/tests/frame/test_indexing.py new file mode 100644 index 000000000..249ddd127 --- /dev/null +++ b/tests/frame/test_indexing.py @@ -0,0 +1,594 @@ +from __future__ import annotations + +from collections import ( + UserDict, + UserList, + deque, +) +from collections.abc import ( + Hashable, + Iterator, +) +import datetime +from enum import Enum +import itertools +from typing import ( + TYPE_CHECKING, + Any, + cast, +) + +import numpy as np +from numpy import typing as npt +import pandas as pd +import pytest +from typing_extensions import assert_type + +from pandas._typing import Scalar + +from tests import check + + +def test_types_getitem() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) + i = pd.Index(["col1", "col2"]) + s = pd.Series(["col1", "col2"]) + select_df = pd.DataFrame({"col1": [True, True], "col2": [False, True]}) + a = np.array(["col1", "col2"]) + check(assert_type(df["col1"], pd.Series), pd.Series) + check(assert_type(df[5], pd.Series), pd.Series) + check(assert_type(df[["col1", "col2"]], pd.DataFrame), pd.DataFrame) + check(assert_type(df[1:], pd.DataFrame), pd.DataFrame) + check(assert_type(df[s], pd.DataFrame), pd.DataFrame) + check(assert_type(df[a], pd.DataFrame), pd.DataFrame) + check(assert_type(df[select_df], pd.DataFrame), pd.DataFrame) + check(assert_type(df[i], pd.DataFrame), pd.DataFrame) + + +def test_types_getitem_with_hashable() -> None: + # Testing getitem 
support for hashable types that are not scalar + # Due to the bug in https://github.com/pandas-dev/pandas-stubs/issues/592 + class MyEnum(Enum): + FIRST = "tayyar" + SECOND = "haydar" + + df = pd.DataFrame( + data=[[12.2, 10], [8.8, 15]], columns=[MyEnum.FIRST, MyEnum.SECOND] + ) + check(assert_type(df[MyEnum.FIRST], pd.Series), pd.Series) + check(assert_type(df[1:], pd.DataFrame), pd.DataFrame) + check(assert_type(df[:2], pd.DataFrame), pd.DataFrame) + + df2 = pd.DataFrame(data=[[12.2, 10], [8.8, 15]], columns=[3, 4]) + check(assert_type(df2[3], pd.Series), pd.Series) + check(assert_type(df2[[3]], pd.DataFrame), pd.DataFrame) + check(assert_type(df2[[3, 4]], pd.DataFrame), pd.DataFrame) + + +def test_slice_setitem() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) + df[1:] = [10, 11, 12] + + +def test_types_setitem() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) + h = cast(Hashable, "col1") + i = pd.Index(["col1", "col2"]) + s = pd.Series(["col1", "col2"]) + a = np.array(["col1", "col2"]) + df["col1"] = [1, 2] + df[5] = [5, 6] + df[h] = [5, 6] + df.loc[:, h] = [5, 6] + df.loc[:, UserList([h])] = [[5], [6]] + df.loc[:, iter([h])] = [[5], [6]] + df[["col1", "col2"]] = [[1, 2], [3, 4]] + df[s] = [5, 6] + df.loc[:, s] = [5, 6] + df["col1"] = [5, 6] + df[df["col1"] > 1] = [5, 6, 7] + df[a] = [[1, 2], [3, 4]] + df[i] = [8, 9] + + +def test_types_setitem_mask() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4], 5: [6, 7]}) + select_df = pd.DataFrame({"col1": [True, True], "col2": [False, True]}) + df[select_df] = [1, 2, 3] + + +def test_types_iloc_iat() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check(assert_type(df.iloc[1, 1], Scalar), np.integer) + check(assert_type(df.iloc[[1], [1]], pd.DataFrame), pd.DataFrame) + + check(assert_type(df.iat[0, 0], Scalar), np.integer) + + # https://github.com/microsoft/python-type-stubs/issues/31 + check(assert_type(df.iloc[:, 
[0]], pd.DataFrame), pd.DataFrame) + check(assert_type(df.iloc[:, 0], pd.Series), pd.Series) + + +def test_types_loc_at() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check(assert_type(df.loc[[0], "col1"], pd.Series), pd.Series) + check(assert_type(df.loc[0, "col1"], Scalar), np.integer) + + check(assert_type(df.at[0, "col1"], Scalar), np.integer) + + +def test_types_boolean_indexing() -> None: + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check(assert_type(df[df > 1], pd.DataFrame), pd.DataFrame) + check(assert_type(df[~(df > 1.0)], pd.DataFrame), pd.DataFrame) + + row_mask = df["col1"] >= 2 + col_mask = df.columns.isin(["col2"]) + check(assert_type(df.loc[row_mask], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[~row_mask], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[row_mask, :], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[:, col_mask], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[row_mask, col_mask], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[~row_mask, ~col_mask], pd.DataFrame), pd.DataFrame) + + +def test_indexslice_setitem() -> None: + df = pd.DataFrame( + {"x": [1, 2, 2, 3], "y": [1, 2, 3, 4], "z": [10, 20, 30, 40]} + ).set_index(["x", "y"]) + s = pd.Series([-1, -2]) + df.loc[pd.IndexSlice[2, :]] = s.values + df.loc[pd.IndexSlice[2, :], "z"] = [200, 300] + # GH 314 + df.loc[pd.IndexSlice[pd.Index([2, 3]), :], "z"] = 99 + + +def test_indexslice_getitem() -> None: + # GH 300 + df = ( + pd.DataFrame({"x": [1, 2, 2, 3, 4], "y": [10, 20, 30, 40, 10]}) + .assign(z=lambda df: df.x * df.y) + .set_index(["x", "y"]) + ) + ind = pd.Index([2, 3]) + check( + assert_type( + pd.IndexSlice[ind, :], tuple["pd.Index[int]", "slice[None, None, None]"] + ), + tuple, + ) + check(assert_type(df.loc[pd.IndexSlice[ind, :]], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[pd.IndexSlice[1:2]], pd.DataFrame), pd.DataFrame) + check( + assert_type(df.loc[pd.IndexSlice[:, df["z"] > 
40], :], pd.DataFrame), + pd.DataFrame, + ) + check(assert_type(df.loc[pd.IndexSlice[2, 30], "z"], Scalar), np.integer) + check( + assert_type(df.loc[pd.IndexSlice[[2, 4], [20, 40]], :], pd.DataFrame), + pd.DataFrame, + ) + # GH 314 + check( + assert_type(df.loc[pd.IndexSlice[pd.Index([2, 4]), :], "z"], pd.Series), + pd.Series, + ) + + +def test_getset_untyped() -> None: + """Test that DataFrame.__getitem__ needs to return untyped series.""" + df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [10, 20, 30, 40, 50]}) + check(assert_type(df["x"].max(), Any), np.integer) + + +def test_getmultiindex_columns() -> None: + mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]]) + df = pd.DataFrame([[1, 2, 3, 4], [10, 20, 30, 40]], columns=mi) + li: list[tuple[int, str]] = [(1, "a"), (2, "b")] + check(assert_type(df[[(1, "a"), (2, "b")]], pd.DataFrame), pd.DataFrame) + check(assert_type(df[li], pd.DataFrame), pd.DataFrame) + check( + assert_type( + df[[(i, s) for i in [1] for s in df.columns.get_level_values(1)]], + pd.DataFrame, + ), + pd.DataFrame, + ) + check(assert_type(df[[df.columns[0]]], pd.DataFrame), pd.DataFrame) + check(assert_type(df[df.columns[0]], pd.Series), pd.Series) + check(assert_type(df[li[0]], pd.Series), pd.Series) + + +def test_frame_isin() -> None: + df = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5]) + check(assert_type(df.isin([1, 3, 5]), pd.DataFrame), pd.DataFrame) + check(assert_type(df.isin({1, 3, 5}), pd.DataFrame), pd.DataFrame) + check(assert_type(df.isin(pd.Series([1, 3, 5])), pd.DataFrame), pd.DataFrame) + check(assert_type(df.isin(pd.Index([1, 3, 5])), pd.DataFrame), pd.DataFrame) + check(assert_type(df.isin(df), pd.DataFrame), pd.DataFrame) + check(assert_type(df.isin({"x": [1, 2]}), pd.DataFrame), pd.DataFrame) + check( + assert_type(df.isin(UserDict({"x": iter([1, "2"])})), pd.DataFrame), + pd.DataFrame, + ) + + +def test_frame_getitem_isin() -> None: + df = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5]) +
check(assert_type(df[df.index.isin([1, 3, 5])], pd.DataFrame), pd.DataFrame) + + +def test_columns_mixlist() -> None: + # GH 97 + df = pd.DataFrame({"a": [1, 2, 3], 1: [3, 4, 5]}) + key: list[int | str] + key = [1] + check(assert_type(df[key], pd.DataFrame), pd.DataFrame) + + +def test_frame_scalars_slice() -> None: + # GH 133 + # scalars: + # str, bytes, datetime.date, datetime.datetime, datetime.timedelta, bool, int, + # float, complex, Timestamp, Timedelta + + str_ = "a" + bytes_ = b"7" + date = datetime.date(1999, 12, 31) + datetime_ = datetime.datetime(1999, 12, 31) + timedelta = datetime.datetime(2000, 1, 1) - datetime.datetime(1999, 12, 31) + bool_ = True + int_ = 2 + float_ = 3.14 + complex_ = 1.0 + 3.0j + timestamp = pd.Timestamp(0) + pd_timedelta = pd.Timedelta(0, unit="D") + none = None + idx = [ + str_, + bytes_, + date, + datetime_, + timedelta, + bool_, + int_, + float_, + complex_, + timestamp, + pd_timedelta, + none, + ] + values = np.arange(len(idx))[:, None] + np.arange(len(idx)) + df = pd.DataFrame(values, columns=idx, index=idx) + + # Note: bool_ cannot be tested since the index is object and pandas does not + # support boolean access using loc except when the index is boolean + check(assert_type(df.loc[str_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[bytes_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[date], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[datetime_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[timedelta], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[int_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[float_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[complex_], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[timestamp], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[pd_timedelta], pd.Series | pd.DataFrame), pd.Series) + 
check(assert_type(df.loc[none], pd.Series), pd.Series) + + check(assert_type(df.loc[:, str_], pd.Series), pd.Series) + check(assert_type(df.loc[:, bytes_], pd.Series), pd.Series) + check(assert_type(df.loc[:, date], pd.Series), pd.Series) + check(assert_type(df.loc[:, datetime_], pd.Series), pd.Series) + check(assert_type(df.loc[:, timedelta], pd.Series), pd.Series) + check(assert_type(df.loc[:, int_], pd.Series), pd.Series) + check(assert_type(df.loc[:, float_], pd.Series), pd.Series) + check(assert_type(df.loc[:, complex_], pd.Series), pd.Series) + check(assert_type(df.loc[:, timestamp], pd.Series), pd.Series) + check(assert_type(df.loc[:, pd_timedelta], pd.Series), pd.Series) + check(assert_type(df.loc[:, none], pd.Series), pd.Series) + + # GH749 + + multi_idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["alpha", "num"]) + df2 = pd.DataFrame({"col1": range(4)}, index=multi_idx) + check(assert_type(df2.loc[str_], pd.Series | pd.DataFrame), pd.DataFrame) + + df3 = pd.DataFrame({"x": range(2)}, index=pd.Index(["a", "b"])) + check(assert_type(df3.loc[str_], pd.Series | pd.DataFrame), pd.Series) + + # https://github.com/microsoft/python-type-stubs/issues/62 + df7 = pd.DataFrame({"x": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) + index = pd.Index(["b"]) + check(assert_type(df7.loc[index], pd.DataFrame), pd.DataFrame) + + +def test_boolean_loc() -> None: + # Booleans can only be used in loc when the index is boolean + df = pd.DataFrame([[0, 1], [1, 0]], columns=[True, False], index=[True, False]) + check(assert_type(df.loc[True], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[:, False], pd.Series), pd.Series) + + +def test_setitem_list() -> None: + # GH 153 + lst1: list[str] = ["a", "b", "c"] + lst2: list[int] = [1, 2, 3] + lst3: list[float] = [4.0, 5.0, 6.0] + lst4: list[tuple[str, int]] = [("a", 1), ("b", 2), ("c", 3)] + lst5: list[complex] = [0 + 1j, 0 + 2j, 0 + 3j] + + columns: list[Hashable] = [ + "a", + "b", + "c", + 1, + 2, + 3, + 
4.0, + 5.0, + 6.0, + ("a", 1), + ("b", 2), + ("c", 3), + 0 + 1j, + 0 + 2j, + 0 + 3j, + ] + + df = pd.DataFrame(np.empty((3, 15)), columns=columns) + + check(assert_type(df.set_index(lst1), pd.DataFrame), pd.DataFrame) + check(assert_type(df.set_index(lst2), pd.DataFrame), pd.DataFrame) + check(assert_type(df.set_index(lst3), pd.DataFrame), pd.DataFrame) + check(assert_type(df.set_index(lst4), pd.DataFrame), pd.DataFrame) + check(assert_type(df.set_index(lst5), pd.DataFrame), pd.DataFrame) + + iter1: Iterator[str] = (v for v in lst1) + iter2: Iterator[tuple[str, int]] = (v for v in lst4) + check(assert_type(df.set_index(iter1), pd.DataFrame), pd.DataFrame) + check(assert_type(df.set_index(iter2), pd.DataFrame), pd.DataFrame) + + +def test_setitem_loc() -> None: + # GH 254 + df = pd.DataFrame.from_dict( + dict.fromkeys(["A", "B", "C"], (True, True, True)), orient="index" + ) + df.loc[["A", "C"]] = False + my_arr = ["A", "C"] + df.loc[my_arr] = False + + +def test_isetframe() -> None: + frame = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + check(assert_type(frame.isetitem(0, 10), None), type(None)) + check(assert_type(frame.isetitem([0], [10, 12]), None), type(None)) + + +def test_setitem_none() -> None: + df = pd.DataFrame( + {"A": [1, 2, 3], "B": ["abc", "def", "ghi"]}, index=["x", "y", "z"] + ) + df.loc["x", "B"] = None + df.iloc[2, 0] = None + sb = pd.Series([1, 2, 3], dtype=int) + sb.loc["y"] = None + sb.iloc[0] = None + + +def test_getsetitem_multiindex() -> None: + # GH 466 + rows = pd.Index(["project A", "project B", "project C"]) + years: tuple[str, ...] = ("Year 1", "Year 2", "Year 3") + quarters: tuple[str, ...] 
= ("Q1", "Q2", "Q3", "Q4") + index_tuples: list[tuple[str, ...]] = list(itertools.product(years, quarters)) + cols = pd.MultiIndex.from_tuples(index_tuples) + budget = pd.DataFrame(index=rows, columns=cols) + multi_index: tuple[str, str] = ("Year 1", "Q1") + budget.loc["project A", multi_index] = 4700 + check(assert_type(budget.loc["project A", multi_index], Scalar), int) + + +def test_getitem_generator() -> None: + # GH 685 + df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + check( + assert_type(df[(f"col{i + 1}" for i in range(2))], pd.DataFrame), pd.DataFrame + ) + + +def test_getitem_dict_keys() -> None: + # GH 770 + some_columns = {"a": [1], "b": [2]} + df = pd.DataFrame.from_dict(some_columns) + check(assert_type(df[some_columns.keys()], pd.DataFrame), pd.DataFrame) + + +def test_frame_setitem_na() -> None: + # GH 743 + df = pd.DataFrame( + {"x": [1, 2, 3], "y": pd.date_range("3/1/2023", "3/3/2023")}, + index=pd.Index(["a", "b", "c"]), + ).convert_dtypes() + + ind = pd.Index(["a", "c"]) + + df.loc[ind, :] = pd.NA + df.iloc[[0, 2], :] = pd.NA + + # reveal_type(df["y"]) gives Series[Any], so we have to cast to tell the + # type checker what kind of type it is when adding to a Timedelta + df["x"] = cast("pd.Series[pd.Timestamp]", df["y"]) + pd.Timedelta(days=3) + df.loc[ind, :] = pd.NaT + df.iloc[[0, 2], :] = pd.NaT + + +def test_loc_set() -> None: + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.loc["a"] = [3, 4] + + +def test_loc_int_set() -> None: + df = pd.DataFrame({1: [1, 2], 2: [3, 4]}) + df.loc[1] = [3, 4] + df.loc[np.int_(1)] = pd.Series([1, 2]) + df.loc[np.uint(1)] = pd.Series([1, 2]) + df.loc[np.int8(1)] = pd.Series([1, 2]) + df.loc[np.int32(1)] = [2, 3] + df.loc[np.uint64(1)] = [2, 3] + + +@pytest.mark.parametrize("col", [1, None]) +@pytest.mark.parametrize("typ", [list, tuple, deque, UserList, iter]) +def test_loc_iterable(col: Hashable, typ: type) -> None: + # GH 189, GH 1410 + df = pd.DataFrame({1: [1, 2], None: 5}, 
columns=pd.Index([1, None], dtype=object)) + check(df.loc[:, typ([col])], pd.DataFrame) + + if TYPE_CHECKING: + assert_type(df.loc[:, [None]], pd.DataFrame) + assert_type(df.loc[:, [1]], pd.DataFrame) + + assert_type(df.loc[:, (None,)], pd.DataFrame) + assert_type(df.loc[:, (1,)], pd.DataFrame) + + assert_type(df.loc[:, deque([None])], pd.DataFrame) + assert_type(df.loc[:, deque([1])], pd.DataFrame) + + assert_type(df.loc[:, UserList([None])], pd.DataFrame) + assert_type(df.loc[:, UserList([1])], pd.DataFrame) + + assert_type(df.loc[:, (None for _ in [0])], pd.DataFrame) + assert_type(df.loc[:, (1 for _ in [0])], pd.DataFrame) + + +def test_loc_slice() -> None: + """Test DataFrame.loc with a slice, Index, Series.""" + # GH277 + df1 = pd.DataFrame( + {"x": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["num", "let"]), + ) + check(assert_type(df1.loc[1, :], pd.Series | pd.DataFrame), pd.DataFrame) + check(assert_type(df1[::-1], pd.DataFrame), pd.DataFrame) + + # GH1299 + ind = pd.Index(["a", "b"]) + mask = pd.Series([True, False]) + mask_col = pd.Series([True, False], index=pd.Index(["a", "b"])) + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + # loc with index for columns + check(assert_type(df.loc[mask, ind], pd.DataFrame), pd.DataFrame) + # loc with boolean Series mask for columns + check(assert_type(df.loc[mask, mask_col], pd.DataFrame), pd.DataFrame) + + +def test_loc_callable() -> None: + # GH 256 + df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + def select1(df: pd.DataFrame) -> pd.Series: + return df["x"] > 2.0 + + check(assert_type(df.loc[select1], pd.DataFrame), pd.DataFrame) + check(assert_type(df.loc[select1, :], pd.DataFrame), pd.DataFrame) + + def select2(df: pd.DataFrame) -> list[Hashable]: + return [i for i in df.index if cast(int, i) % 2 == 1] + + check(assert_type(df.loc[select2, "x"], pd.Series), pd.Series) + + def select3(_: pd.DataFrame) -> int: + return 1 + + check(assert_type(df.loc[select3, "x"], Scalar), np.integer)
+ + check( + assert_type(df.loc[:, lambda df: df.columns.str.startswith("x")], pd.DataFrame), + pd.DataFrame, + ) + + +def test_npint_loc_indexer() -> None: + # GH 508 + + df = pd.DataFrame({"x": [1, 2, 3]}, index=np.array([10, 20, 30], dtype="uint64")) + + def get_NDArray(df: pd.DataFrame, key: npt.NDArray[np.uint64]) -> pd.DataFrame: + return df.loc[key] + + a: npt.NDArray[np.uint64] = np.array([10, 30], dtype="uint64") + check(assert_type(get_NDArray(df, a), pd.DataFrame), pd.DataFrame) + + +def test_loc_list_str() -> None: + # GH 1162 (PR) + df = pd.DataFrame( + [[1, 2], [4, 5], [7, 8]], + index=["cobra", "viper", "sidewinder"], + columns=["max_speed", "shield"], + ) + + result = df.loc[["viper", "sidewinder"]] + check(assert_type(result, pd.DataFrame), pd.DataFrame) + + +def test_loc_returns_series() -> None: + df1 = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40]) + df2 = df1.loc[10, :] + check(assert_type(df2, pd.Series | pd.DataFrame), pd.Series) + + +def test_frame_single_slice() -> None: + # GH 572 + df = pd.DataFrame([1, 2, 3]) + check(assert_type(df.loc[:], pd.DataFrame), pd.DataFrame) + + df.loc[:] = 1 + df + + +def test_frame_index_timestamp() -> None: + # GH 620 + dt1 = pd.to_datetime("2023-05-01") + dt2 = pd.to_datetime("2023-05-02") + s = pd.Series([1, 2], index=[dt1, dt2]) + df = pd.DataFrame(s) + # Next result is Series or DataFrame because the index could be a MultiIndex + check(assert_type(df.loc[dt1, :], pd.Series | pd.DataFrame), pd.Series) + check(assert_type(df.loc[[dt1], :], pd.DataFrame), pd.DataFrame) + df2 = pd.DataFrame({"x": s}) + check(assert_type(df2.loc[dt1, "x"], Scalar), np.integer) + check(assert_type(df2.loc[[dt1], "x"], pd.Series), pd.Series, np.integer) + + +def test_df_loc_dict() -> None: + """Test that we can set a dict to a df.loc result GH1203.""" + df = pd.DataFrame(columns=["X"]) + df.loc[0] = {"X": 0} + check(assert_type(df, pd.DataFrame), pd.DataFrame) + + df.iloc[0] = {"X": 0} + check(assert_type(df, 
pd.DataFrame), pd.DataFrame) + + +def test_iloc_npint() -> None: + # GH 69 + df = pd.DataFrame({"a": [10, 20, 30], "b": [20, 40, 60], "c": [30, 60, 90]}) + iloc = np.argmin(np.random.standard_normal(3)) + df.iloc[iloc] + + +# https://github.com/pandas-dev/pandas-stubs/issues/143 +def test_iloc_tuple() -> None: + df = pd.DataFrame({"Char": ["A", "B", "C"], "Number": [1, 2, 3]}) + df = df.iloc[0:2,] + + +def test_frame_ndarray_assignmment() -> None: + # GH 100 + df_a = pd.DataFrame({"a": [0.0] * 10}) + df_a.iloc[:, :] = np.array([[-1.0]] * 10) + + df_b = pd.DataFrame({"a": [0.0] * 10, "b": [1.0] * 10}) + df_b.iloc[:, :] = np.array([[-1.0, np.inf]] * 10)