|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
3 |
| -import collections |
4 |
| -from collections import Counter |
5 | 3 | from decimal import Decimal
|
6 | 4 | import operator
|
7 | 5 | import os
|
|
24 | 22 |
|
25 | 23 | from pandas.compat import pa_version_under10p1
|
26 | 24 |
|
27 |
| -from pandas.core.dtypes.common import ( |
28 |
| - is_sequence, |
29 |
| - is_string_dtype, |
30 |
| -) |
| 25 | +from pandas.core.dtypes.common import is_string_dtype |
31 | 26 |
|
32 | 27 | import pandas as pd
|
33 | 28 | from pandas import (
|
|
38 | 33 | MultiIndex,
|
39 | 34 | RangeIndex,
|
40 | 35 | Series,
|
41 |
| - date_range, |
42 |
| - period_range, |
43 |
| - timedelta_range, |
44 | 36 | )
|
45 | 37 | from pandas._testing._io import (
|
46 | 38 | round_trip_localpath,
|
@@ -332,229 +324,6 @@ def to_array(obj):
|
332 | 324 | return extract_array(obj, extract_numpy=True)
|
333 | 325 |
|
334 | 326 |
|
335 |
| -# ----------------------------------------------------------------------------- |
336 |
| -# Others |
337 |
| - |
338 |
| - |
339 |
| -def makeCustomIndex( |
340 |
| - nentries, |
341 |
| - nlevels, |
342 |
| - prefix: str = "#", |
343 |
| - names: bool | str | list[str] | None = False, |
344 |
| - ndupe_l=None, |
345 |
| - idx_type=None, |
346 |
| -) -> Index: |
347 |
| - """ |
348 |
| - Create an index/multindex with given dimensions, levels, names, etc' |
349 |
| -
|
350 |
| - nentries - number of entries in index |
351 |
| - nlevels - number of levels (> 1 produces multindex) |
352 |
| - prefix - a string prefix for labels |
353 |
| - names - (Optional), bool or list of strings. if True will use default |
354 |
| - names, if false will use no names, if a list is given, the name of |
355 |
| - each level in the index will be taken from the list. |
356 |
| - ndupe_l - (Optional), list of ints, the number of rows for which the |
357 |
| - label will repeated at the corresponding level, you can specify just |
358 |
| - the first few, the rest will use the default ndupe_l of 1. |
359 |
| - len(ndupe_l) <= nlevels. |
360 |
| - idx_type - "i"/"f"/"s"/"dt"/"p"/"td". |
361 |
| - If idx_type is not None, `idx_nlevels` must be 1. |
362 |
| - "i"/"f" creates an integer/float index, |
363 |
| - "s" creates a string |
364 |
| - "dt" create a datetime index. |
365 |
| - "td" create a datetime index. |
366 |
| -
|
367 |
| - if unspecified, string labels will be generated. |
368 |
| - """ |
369 |
| - if ndupe_l is None: |
370 |
| - ndupe_l = [1] * nlevels |
371 |
| - assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels |
372 |
| - assert names is None or names is False or names is True or len(names) is nlevels |
373 |
| - assert idx_type is None or ( |
374 |
| - idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 |
375 |
| - ) |
376 |
| - |
377 |
| - if names is True: |
378 |
| - # build default names |
379 |
| - names = [prefix + str(i) for i in range(nlevels)] |
380 |
| - if names is False: |
381 |
| - # pass None to index constructor for no name |
382 |
| - names = None |
383 |
| - |
384 |
| - # make singleton case uniform |
385 |
| - if isinstance(names, str) and nlevels == 1: |
386 |
| - names = [names] |
387 |
| - |
388 |
| - # specific 1D index type requested? |
389 |
| - idx_func_dict: dict[str, Callable[..., Index]] = { |
390 |
| - "i": lambda n: Index(np.arange(n), dtype=np.int64), |
391 |
| - "f": lambda n: Index(np.arange(n), dtype=np.float64), |
392 |
| - "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), |
393 |
| - "dt": lambda n: date_range("2020-01-01", periods=n), |
394 |
| - "td": lambda n: timedelta_range("1 day", periods=n), |
395 |
| - "p": lambda n: period_range("2020-01-01", periods=n, freq="D"), |
396 |
| - } |
397 |
| - idx_func = idx_func_dict.get(idx_type) |
398 |
| - if idx_func: |
399 |
| - idx = idx_func(nentries) |
400 |
| - # but we need to fill in the name |
401 |
| - if names: |
402 |
| - idx.name = names[0] |
403 |
| - return idx |
404 |
| - elif idx_type is not None: |
405 |
| - raise ValueError( |
406 |
| - f"{repr(idx_type)} is not a legal value for `idx_type`, " |
407 |
| - "use 'i'/'f'/'s'/'dt'/'p'/'td'." |
408 |
| - ) |
409 |
| - |
410 |
| - if len(ndupe_l) < nlevels: |
411 |
| - ndupe_l.extend([1] * (nlevels - len(ndupe_l))) |
412 |
| - assert len(ndupe_l) == nlevels |
413 |
| - |
414 |
| - assert all(x > 0 for x in ndupe_l) |
415 |
| - |
416 |
| - list_of_lists = [] |
417 |
| - for i in range(nlevels): |
418 |
| - |
419 |
| - def keyfunc(x): |
420 |
| - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") |
421 |
| - return [int(num) for num in numeric_tuple] |
422 |
| - |
423 |
| - # build a list of lists to create the index from |
424 |
| - div_factor = nentries // ndupe_l[i] + 1 |
425 |
| - |
426 |
| - # Deprecated since version 3.9: collections.Counter now supports []. See PEP 585 |
427 |
| - # and Generic Alias Type. |
428 |
| - cnt: Counter[str] = collections.Counter() |
429 |
| - for j in range(div_factor): |
430 |
| - label = f"{prefix}_l{i}_g{j}" |
431 |
| - cnt[label] = ndupe_l[i] |
432 |
| - # cute Counter trick |
433 |
| - result = sorted(cnt.elements(), key=keyfunc)[:nentries] |
434 |
| - list_of_lists.append(result) |
435 |
| - |
436 |
| - tuples = list(zip(*list_of_lists)) |
437 |
| - |
438 |
| - # convert tuples to index |
439 |
| - if nentries == 1: |
440 |
| - # we have a single level of tuples, i.e. a regular Index |
441 |
| - name = None if names is None else names[0] |
442 |
| - index = Index(tuples[0], name=name) |
443 |
| - elif nlevels == 1: |
444 |
| - name = None if names is None else names[0] |
445 |
| - index = Index((x[0] for x in tuples), name=name) |
446 |
| - else: |
447 |
| - index = MultiIndex.from_tuples(tuples, names=names) |
448 |
| - return index |
449 |
| - |
450 |
| - |
451 |
| -def makeCustomDataframe( |
452 |
| - nrows, |
453 |
| - ncols, |
454 |
| - c_idx_names: bool | list[str] = True, |
455 |
| - r_idx_names: bool | list[str] = True, |
456 |
| - c_idx_nlevels: int = 1, |
457 |
| - r_idx_nlevels: int = 1, |
458 |
| - data_gen_f=None, |
459 |
| - c_ndupe_l=None, |
460 |
| - r_ndupe_l=None, |
461 |
| - dtype=None, |
462 |
| - c_idx_type=None, |
463 |
| - r_idx_type=None, |
464 |
| -) -> DataFrame: |
465 |
| - """ |
466 |
| - Create a DataFrame using supplied parameters. |
467 |
| -
|
468 |
| - Parameters |
469 |
| - ---------- |
470 |
| - nrows, ncols - number of data rows/cols |
471 |
| - c_idx_names, r_idx_names - False/True/list of strings, yields No names , |
472 |
| - default names or uses the provided names for the levels of the |
473 |
| - corresponding index. You can provide a single string when |
474 |
| - c_idx_nlevels ==1. |
475 |
| - c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex |
476 |
| - r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex |
477 |
| - data_gen_f - a function f(row,col) which return the data value |
478 |
| - at that position, the default generator used yields values of the form |
479 |
| - "RxCy" based on position. |
480 |
| - c_ndupe_l, r_ndupe_l - list of integers, determines the number |
481 |
| - of duplicates for each label at a given level of the corresponding |
482 |
| - index. The default `None` value produces a multiplicity of 1 across |
483 |
| - all levels, i.e. a unique index. Will accept a partial list of length |
484 |
| - N < idx_nlevels, for just the first N levels. If ndupe doesn't divide |
485 |
| - nrows/ncol, the last label might have lower multiplicity. |
486 |
| - dtype - passed to the DataFrame constructor as is, in case you wish to |
487 |
| - have more control in conjunction with a custom `data_gen_f` |
488 |
| - r_idx_type, c_idx_type - "i"/"f"/"s"/"dt"/"td". |
489 |
| - If idx_type is not None, `idx_nlevels` must be 1. |
490 |
| - "i"/"f" creates an integer/float index, |
491 |
| - "s" creates a string index |
492 |
| - "dt" create a datetime index. |
493 |
| - "td" create a timedelta index. |
494 |
| -
|
495 |
| - if unspecified, string labels will be generated. |
496 |
| -
|
497 |
| - Examples |
498 |
| - -------- |
499 |
| - # 5 row, 3 columns, default names on both, single index on both axis |
500 |
| - >> makeCustomDataframe(5,3) |
501 |
| -
|
502 |
| - # make the data a random int between 1 and 100 |
503 |
| - >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) |
504 |
| -
|
505 |
| - # 2-level multiindex on rows with each label duplicated |
506 |
| - # twice on first level, default names on both axis, single |
507 |
| - # index on both axis |
508 |
| - >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) |
509 |
| -
|
510 |
| - # DatetimeIndex on row, index with unicode labels on columns |
511 |
| - # no names on either axis |
512 |
| - >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, |
513 |
| - r_idx_type="dt",c_idx_type="u") |
514 |
| -
|
515 |
| - # 4-level multindex on rows with names provided, 2-level multindex |
516 |
| - # on columns with default labels and default names. |
517 |
| - >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, |
518 |
| - r_idx_names=["FEE","FIH","FOH","FUM"], |
519 |
| - c_idx_nlevels=2) |
520 |
| -
|
521 |
| - >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) |
522 |
| - """ |
523 |
| - assert c_idx_nlevels > 0 |
524 |
| - assert r_idx_nlevels > 0 |
525 |
| - assert r_idx_type is None or ( |
526 |
| - r_idx_type in ("i", "f", "s", "dt", "p", "td") and r_idx_nlevels == 1 |
527 |
| - ) |
528 |
| - assert c_idx_type is None or ( |
529 |
| - c_idx_type in ("i", "f", "s", "dt", "p", "td") and c_idx_nlevels == 1 |
530 |
| - ) |
531 |
| - |
532 |
| - columns = makeCustomIndex( |
533 |
| - ncols, |
534 |
| - nlevels=c_idx_nlevels, |
535 |
| - prefix="C", |
536 |
| - names=c_idx_names, |
537 |
| - ndupe_l=c_ndupe_l, |
538 |
| - idx_type=c_idx_type, |
539 |
| - ) |
540 |
| - index = makeCustomIndex( |
541 |
| - nrows, |
542 |
| - nlevels=r_idx_nlevels, |
543 |
| - prefix="R", |
544 |
| - names=r_idx_names, |
545 |
| - ndupe_l=r_ndupe_l, |
546 |
| - idx_type=r_idx_type, |
547 |
| - ) |
548 |
| - |
549 |
| - # by default, generate data based on location |
550 |
| - if data_gen_f is None: |
551 |
| - data_gen_f = lambda r, c: f"R{r}C{c}" |
552 |
| - |
553 |
| - data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] |
554 |
| - |
555 |
| - return DataFrame(data, index, columns, dtype=dtype) |
556 |
| - |
557 |
| - |
558 | 327 | class SubclassedSeries(Series):
|
559 | 328 | _metadata = ["testattr", "name"]
|
560 | 329 |
|
@@ -868,8 +637,6 @@ def shares_memory(left, right) -> bool:
|
868 | 637 | "iat",
|
869 | 638 | "iloc",
|
870 | 639 | "loc",
|
871 |
| - "makeCustomDataframe", |
872 |
| - "makeCustomIndex", |
873 | 640 | "maybe_produces_warning",
|
874 | 641 | "NARROW_NP_DTYPES",
|
875 | 642 | "NP_NAT_OBJECTS",
|
|
0 commit comments