From eadbdf3661e1645db4aebdcd7714deb95ca212a0 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 29 May 2019 18:46:16 +0200 Subject: [PATCH 1/4] PERF: don't call RangeIndex._data unnecessarily --- asv_bench/benchmarks/index_object.py | 6 ++++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/range.py | 19 ++++++++++++++++++ pandas/tests/indexes/test_range.py | 30 ++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 896a20bae2069..ffa1c37e2982d 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -94,6 +94,12 @@ def time_min(self): def time_min_trivial(self): self.idx_inc.min() + def time_get_loc_inc(self): + self.idx_inc.get_loc(900000) + + def time_get_loc_dec(self): + self.idx_dec.get_loc(900000) + class IndexAppend: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a62cac7a94bbd..1bd5d1ea9d922 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -393,6 +393,7 @@ Performance Improvements - Improved performance of :meth:`Series.searchsorted`. 
The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) +- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) - Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ea14a4c789cd3..8f1907ebf5b5b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -22,6 +22,8 @@ from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index +from pandas.io.formats.printing import pprint_thing + class RangeIndex(Int64Index): """ @@ -64,6 +66,8 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + # check whether self._data has benn called + _has_called_data = False # type: bool # -------------------------------------------------------------------- # Constructors @@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, for k, v in kwargs.items(): setattr(result, k, v) + result._range = range(result._start, result._stop, result._step) + result._reset_identity() return result @@ -182,6 +188,7 @@ def _constructor(self): @cache_readonly def _data(self): + self._has_called_data = True return np.arange(self._start, self._stop, self._step, dtype=np.int64) @cache_readonly @@ -215,6 +222,9 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def 
_format_with_header(self, header, na_rep='NaN', **kwargs): + return header + [pprint_thing(x) for x in self._range] + # -------------------------------------------------------------------- @property def start(self): @@ -296,6 +306,15 @@ def is_monotonic_decreasing(self): def has_duplicates(self): return False + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if method is None and tolerance is None: + try: + return self._range.index(key) + except ValueError: + raise KeyError(key) + return super().__get_loc(key, method=method, tolerance=tolerance) + def tolist(self): return list(range(self._start, self._stop, self._step)) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b2c330015081c..d1c555d3f66fa 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -241,6 +241,36 @@ def test_view(self): def test_dtype(self): assert self.index.dtype == np.int64 + def test_has_called_data(self): + # Calling RangeIndex._data caches a array of the same length. 
+ # This tests whether RangeIndex._data has been called by doing methods + idx = RangeIndex(0, 100, 10) + assert idx._has_called_data is False + + repr(idx) + assert idx._has_called_data is False + + str(idx) + assert idx._has_called_data is False + + idx.get_loc(20) + assert idx._has_called_data is False + + df = pd.DataFrame({'a': range(10)}, index=idx) + + df.loc[50] + assert idx._has_called_data is False + + with pytest.raises(KeyError): + df.loc[51] + assert idx._has_called_data is False + + df.loc[10:50] + assert idx._has_called_data is False + + df.iloc[5:10] + assert idx._has_called_data is False + def test_is_monotonic(self): assert self.index.is_monotonic is True assert self.index.is_monotonic_increasing is True From 803a97d68b18d2a32c3424f78d7d5b3380b95091 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 30 May 2019 00:14:11 +0200 Subject: [PATCH 2/4] guard against invalid key --- pandas/core/indexes/range.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8f1907ebf5b5b..9525fe0152bf1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -308,12 +308,12 @@ def has_duplicates(self): @Appender(_index_shared_docs['get_loc']) def get_loc(self, key, method=None, tolerance=None): - if method is None and tolerance is None: + if is_integer(key) and method is None and tolerance is None: try: return self._range.index(key) except ValueError: raise KeyError(key) - return super().__get_loc(key, method=method, tolerance=tolerance) + return super().get_loc(key, method=method, tolerance=tolerance) def tolist(self): return list(range(self._start, self._stop, self._step)) From a74db3f5493e4300e11a5ae8f0ceb9afe7bfcdbc Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 31 May 2019 15:29:48 +0200 Subject: [PATCH 3/4] changes --- asv_bench/benchmarks/index_object.py | 2 +- pandas/core/indexes/range.py | 12 +++++++----- pandas/tests/indexes/test_range.py | 27 
++++++++++++++++----------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index ffa1c37e2982d..78fe2ae966896 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -98,7 +98,7 @@ def time_get_loc_inc(self): self.idx_inc.get_loc(900000) def time_get_loc_dec(self): - self.idx_dec.get_loc(900000) + self.idx_dec.get_loc(100000) class IndexAppend: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9525fe0152bf1..7e3f4cd1b9303 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -67,7 +67,7 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine # check whether self._data has benn called - _has_called_data = False # type: bool + _cached_data = None # type: np.ndarray # -------------------------------------------------------------------- # Constructors @@ -186,10 +186,12 @@ def _constructor(self): """ return the class to use for construction """ return Int64Index - @cache_readonly + @property def _data(self): - self._has_called_data = True - return np.arange(self._start, self._stop, self._step, dtype=np.int64) + if self._cached_data is None: + self._cached_data = np.arange(self._start, self._stop, self._step, + dtype=np.int64) + return self._cached_data @cache_readonly def _int64index(self): @@ -223,7 +225,7 @@ def _format_data(self, name=None): return None def _format_with_header(self, header, na_rep='NaN', **kwargs): - return header + [pprint_thing(x) for x in self._range] + return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- @property diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index d1c555d3f66fa..b118727154c0c 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -241,35 +241,40 @@ def test_view(self): def 
test_dtype(self): assert self.index.dtype == np.int64 - def test_has_called_data(self): - # Calling RangeIndex._data caches a array of the same length. - # This tests whether RangeIndex._data has been called by doing methods + def test_cached_data(self): + # Calling RangeIndex._data caches an int64 array of the same length at + # self._cached_data. This tests whether _cached_data has been set. idx = RangeIndex(0, 100, 10) - assert idx._has_called_data is False + + assert idx._cached_data is None repr(idx) - assert idx._has_called_data is False + assert idx._cached_data is None str(idx) - assert idx._has_called_data is False + assert idx._cached_data is None idx.get_loc(20) - assert idx._has_called_data is False + assert idx._cached_data is None df = pd.DataFrame({'a': range(10)}, index=idx) df.loc[50] - assert idx._has_called_data is False + assert idx._cached_data is None with pytest.raises(KeyError): df.loc[51] - assert idx._has_called_data is False + assert idx._cached_data is None df.loc[10:50] - assert idx._has_called_data is False + assert idx._cached_data is None df.iloc[5:10] - assert idx._has_called_data is False + assert idx._cached_data is None + + # actually calling idx._data + assert isinstance(idx._data, np.ndarray) + assert isinstance(idx._cached_data, np.ndarray) def test_is_monotonic(self): assert self.index.is_monotonic is True From c72758b8e2d24a33ba95abb8d9d032d98268cff4 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 1 Jun 2019 17:11:48 +0200 Subject: [PATCH 4/4] Doc string changes --- pandas/core/indexes/range.py | 7 +++++++ pandas/tests/indexes/test_range.py | 1 + 2 files changed, 8 insertions(+) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7e3f4cd1b9303..9401de3346ccd 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -188,6 +188,13 @@ def _constructor(self): @property def _data(self): + """ + An int array that for performance reasons is created only when needed. 
+ + The constructed array is saved in ``_cached_data``. This allows us to + check if the array has been created without accessing ``_data`` and + triggering the construction. + """ if self._cached_data is None: self._cached_data = np.arange(self._start, self._stop, self._step, dtype=np.int64) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b118727154c0c..477a4e527f278 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -242,6 +242,7 @@ def test_dtype(self): assert self.index.dtype == np.int64 def test_cached_data(self): + # GH 26565 # Calling RangeIndex._data caches an int64 array of the same length at # self._cached_data. This tests whether _cached_data has been set. idx = RangeIndex(0, 100, 10)