%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
op= Optimus(engine="spark")
WARNING:root:Found pyspark version "3.2.0" installed. The pyspark version 3.2 and above has a built-in "pandas APIs on Spark" module ported from Koalas. Try `import pyspark.pandas as ps` instead. WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. Koalas will set it for you but it does not work if there is a Spark context already launched. INFO:optimus:Operative System:Windows INFO:optimus:Just check that all necessary environments vars are present... INFO:optimus:----- INFO:optimus:SPARK_HOME=C:\opt\spark\spark-3.1.1-bin-hadoop3.2 INFO:optimus:HADOOP_HOME=C:\opt\hadoop-3.2.2 INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\envs\python38\python.EXE INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter INFO:optimus:PYSPARK_SUBMIT_ARGS=--conf "spark.sql.catalogImplementation=hive" pyspark-shell INFO:optimus:JAVA_HOME=C:\java INFO:optimus:Pyarrow Installed INFO:optimus:----- INFO:optimus:Starting or getting SparkSession and SparkContext... INFO:optimus:Spark Version:3.1.1 INFO:optimus: ____ __ _ / __ \____ / /_(_)___ ___ __ _______ / / / / __ \/ __/ / __ `__ \/ / / / ___/ / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \____/ .___/\__/_/_/ /_/ /_/\__,_/____/ /_/ INFO:optimus:Transform and Roll out... INFO:optimus:Optimus successfully imported. Have fun :).
# df = op.load.csv("https://raw.githubusercontent.com/hi-primus/optimus/develop-22.1/examples/data/crime.csv")
df = op.load.csv("data/crime.csv")
df.cols.upper()
INCIDENT_NUMBER
1 (object)
|
OFFENSE_CODE
2 (object)
|
OFFENSE_CODE_GROUP
3 (object)
|
OFFENSE_DESCRIPTION
4 (object)
|
DISTRICT
5 (object)
|
REPORTING_AREA
6 (object)
|
SHOOTING
7 (object)
|
OCCURRED_ON_DATE
8 (object)
|
YEAR
9 (object)
|
MONTH
10 (object)
|
DAY_OF_WEEK
11 (object)
|
HOUR
12 (object)
|
UCR_PART
13 (object)
|
STREET
14 (object)
|
Lat
15 (object)
|
Long
16 (object)
|
Location
17 (object)
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
LARCENY
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
NONE
|
2018-09-02⋅13:00:00
|
2018
|
9
|
SUNDAY
|
13
|
PART⋅ONE
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
VANDALISM
|
VANDALISM
|
C11
|
347
|
NONE
|
2018-08-21⋅00:00:00
|
2018
|
8
|
TUESDAY
|
0
|
PART⋅TWO
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
TOWED
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
NONE
|
2018-09-03⋅19:27:00
|
2018
|
9
|
MONDAY
|
19
|
PART⋅THREE
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
INVESTIGATE⋅PROPERTY
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
NONE
|
2018-09-03⋅21:16:00
|
2018
|
9
|
MONDAY
|
21
|
PART⋅THREE
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
INVESTIGATE⋅PROPERTY
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
NONE
|
2018-09-03⋅21:05:00
|
2018
|
9
|
MONDAY
|
21
|
PART⋅THREE
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
MOTOR⋅VEHICLE⋅ACCIDENT⋅RESPONSE
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
NONE
|
2018-09-03⋅21:09:00
|
2018
|
9
|
MONDAY
|
21
|
PART⋅THREE
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
AUTO⋅THEFT
|
AUTO⋅THEFT
|
B2
|
330
|
NONE
|
2018-09-03⋅21:25:00
|
2018
|
9
|
MONDAY
|
21
|
PART⋅ONE
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
B2
|
584
|
NONE
|
2018-09-03⋅20:39:37
|
2018
|
9
|
MONDAY
|
20
|
PART⋅THREE
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
ROBBERY
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
NONE
|
2018-09-03⋅20:48:00
|
2018
|
9
|
MONDAY
|
20
|
PART⋅ONE
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
C11
|
364
|
NONE
|
2018-09-03⋅20:38:00
|
2018
|
9
|
MONDAY
|
20
|
PART⋅THREE
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
I182070928
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
C6
|
913
|
NONE
|
2018-09-03⋅19:55:00
|
2018
|
9
|
MONDAY
|
19
|
PART⋅THREE
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
df.data["OFFENSE_CODE"].astype("float")
0 619.0 1 1402.0 2 3410.0 3 3114.0 4 3114.0 5 3820.0 6 724.0 7 3301.0 8 301.0 9 3301.0 10 3301.0 11 3114.0 12 3108.0 13 2647.0 14 3201.0 15 3006.0 16 3301.0 17 3305.0 18 2647.0 19 614.0 20 3006.0 21 3801.0 22 3006.0 23 3803.0 24 522.0 25 3831.0 26 3006.0 27 802.0 28 2007.0 29 2900.0 30 2907.0 31 2629.0 32 802.0 33 2662.0 34 3207.0 35 614.0 36 613.0 37 3109.0 38 2612.0 39 1843.0 40 3125.0 41 1841.0 42 301.0 43 1402.0 44 3802.0 45 619.0 46 3801.0 47 1402.0 48 3018.0 49 3301.0 50 3831.0 51 3803.0 52 802.0 53 2405.0 54 617.0 55 3625.0 56 3106.0 57 1402.0 58 3501.0 59 3115.0 60 616.0 61 3807.0 62 301.0 63 1102.0 64 3801.0 65 3006.0 66 3114.0 67 3006.0 68 2629.0 69 3201.0 70 613.0 71 3006.0 72 802.0 73 802.0 74 3115.0 75 3006.0 76 1849.0 77 802.0 78 613.0 79 724.0 80 3831.0 81 3006.0 82 1102.0 83 2647.0 84 3820.0 85 613.0 86 3115.0 87 3301.0 88 3803.0 89 3201.0 90 617.0 91 2905.0 92 3802.0 93 3410.0 94 1102.0 95 2647.0 96 1810.0 97 1842.0 98 613.0 99 3801.0 100 3831.0 101 3006.0 102 3114.0 103 614.0 104 1402.0 105 1806.0 106 1848.0 107 1844.0 108 3831.0 109 3301.0 110 619.0 111 423.0 112 522.0 113 3115.0 114 3006.0 115 3831.0 116 3205.0 117 2647.0 118 413.0 119 1501.0 120 616.0 121 522.0 122 1402.0 123 3001.0 124 3831.0 125 2900.0 126 3410.0 127 2629.0 128 3301.0 129 3115.0 130 301.0 131 3301.0 132 3006.0 133 613.0 134 2900.0 135 3801.0 136 3114.0 137 3410.0 138 3830.0 139 3801.0 140 3301.0 141 3207.0 142 561.0 143 2914.0 144 3006.0 145 423.0 146 802.0 147 2405.0 148 2610.0 149 3006.0 150 619.0 151 3410.0 152 3115.0 153 802.0 154 3301.0 155 1109.0 156 3201.0 157 3115.0 158 3201.0 159 706.0 160 1402.0 161 3115.0 162 1402.0 163 3006.0 164 3301.0 165 614.0 166 3115.0 167 413.0 168 2403.0 169 3801.0 170 3301.0 171 3114.0 172 3115.0 173 3006.0 174 3115.0 175 1402.0 176 301.0 177 423.0 178 3160.0 179 900.0 180 2610.0 181 3301.0 182 3301.0 183 413.0 184 3115.0 185 3125.0 186 3831.0 187 3115.0 188 3114.0 189 3301.0 190 801.0 191 3801.0 192 3301.0 193 3002.0 194 802.0 195 413.0 196 2900.0 197 3410.0 198 802.0 199 3820.0 200 3114.0 201 3301.0 202 802.0 203 802.0 204 802.0 205 619.0 206 520.0 207 2007.0 208 3801.0 209 802.0 210 1402.0 211 613.0 212 802.0 213 619.0 214 3006.0 215 3115.0 216 3114.0 217 3006.0 218 3114.0 219 413.0 220 2405.0 221 2610.0 222 3301.0 223 413.0 224 3006.0 225 3801.0 226 3115.0 227 1402.0 228 2648.0 229 3108.0 230 2905.0 231 3802.0 232 3115.0 233 3803.0 234 3501.0 235 3006.0 236 423.0 237 802.0 238 802.0 239 2647.0 240 3207.0 241 613.0 242 802.0 243 3802.0 244 3803.0 245 3301.0 246 3802.0 247 3301.0 248 1402.0 249 613.0 250 670.0 251 615.0 252 3116.0 253 802.0 254 3802.0 255 3006.0 256 3301.0 257 3201.0 258 1504.0 259 423.0 260 2405.0 261 3006.0 262 2629.0 263 3831.0 264 1830.0 265 3831.0 266 3006.0 267 614.0 268 614.0 269 1402.0 270 2405.0 271 3108.0 272 3820.0 273 361.0 274 3006.0 275 802.0 276 2006.0 277 801.0 278 3831.0 279 3114.0 280 1843.0 281 1849.0 282 1810.0 283 3006.0 284 3831.0 285 3115.0 286 3006.0 287 724.0 288 3831.0 289 616.0 290 1402.0 291 619.0 292 1402.0 293 3410.0 294 802.0 295 616.0 296 3114.0 297 3114.0 298 3114.0 299 311.0 300 3006.0 301 619.0 302 2647.0 303 3301.0 304 3006.0 305 1402.0 306 1402.0 307 301.0 308 3115.0 309 3114.0 310 3831.0 311 3006.0 312 3831.0 313 3201.0 314 613.0 315 3115.0 316 617.0 317 802.0 318 413.0 319 3301.0 320 3410.0 321 3006.0 322 520.0 323 3301.0 324 1001.0 325 2647.0 326 2907.0 327 613.0 328 1830.0 329 3803.0 330 706.0 331 3410.0 332 3006.0 333 3201.0 334 3810.0 335 802.0 336 2647.0 337 2401.0 338 2405.0 339 3501.0 340 2629.0 341 613.0 342 1402.0 343 3501.0 344 3301.0 345 301.0 346 413.0 347 2403.0 348 3301.0 349 413.0 350 3201.0 351 616.0 352 3802.0 353 3831.0 354 2646.0 355 3201.0 356 3115.0 357 3201.0 358 1402.0 359 3001.0 360 613.0 361 3831.0 362 3115.0 363 3802.0 364 3802.0 365 3301.0 366 3821.0 367 619.0 368 3831.0 369 3831.0 370 613.0 371 613.0 372 613.0 373 3811.0 374 2647.0 375 3114.0 376 3201.0 377 3831.0 378 3201.0 379 3006.0 380 3410.0 381 1402.0 382 3115.0 383 3831.0 384 3115.0 385 3410.0 386 361.0 387 3831.0 388 619.0 389 3410.0 390 1402.0 391 619.0 392 1402.0 393 3301.0 394 3006.0 395 3410.0 396 619.0 397 616.0 398 3006.0 399 3001.0 400 3007.0 401 1402.0 402 1402.0 403 3802.0 404 2660.0 405 413.0 406 3115.0 407 619.0 408 802.0 409 3830.0 410 3006.0 411 3114.0 412 301.0 413 3802.0 414 3115.0 415 2647.0 416 3301.0 417 3115.0 418 3114.0 419 3803.0 420 3207.0 421 801.0 422 802.0 423 3114.0 424 3006.0 425 3002.0 426 801.0 427 1402.0 428 3803.0 429 3201.0 430 3006.0 431 3831.0 432 614.0 433 3802.0 434 3115.0 435 1402.0 436 3006.0 437 1402.0 438 423.0 439 3125.0 440 3115.0 441 1831.0 442 3114.0 443 3108.0 444 1402.0 445 706.0 446 3803.0 447 3119.0 448 3820.0 449 3115.0 450 3301.0 451 3108.0 452 3802.0 453 3115.0 454 1402.0 455 3115.0 456 802.0 457 613.0 458 1402.0 459 614.0 460 613.0 461 3006.0 462 619.0 463 724.0 464 3006.0 465 614.0 466 3115.0 467 3831.0 468 802.0 469 2660.0 470 413.0 471 802.0 472 1402.0 473 3170.0 474 3301.0 475 3207.0 476 3006.0 477 2629.0 478 3006.0 479 3301.0 480 802.0 481 615.0 482 1849.0 483 1874.0 484 3006.0 485 2405.0 486 3831.0 487 3112.0 488 3201.0 489 1842.0 490 2610.0 491 802.0 492 2610.0 493 3301.0 494 3115.0 495 3501.0 496 3502.0 497 3831.0 498 3112.0 499 2647.0 500 1102.0 501 617.0 502 3114.0 503 3007.0 504 613.0 505 3115.0 506 2647.0 507 3301.0 508 3301.0 509 3114.0 510 619.0 511 3301.0 512 735.0 513 2905.0 514 2900.0 515 3115.0 516 3301.0 517 1402.0 518 612.0 519 3410.0 520 3115.0 521 560.0 522 3115.0 523 3831.0 524 619.0 525 3006.0 526 2647.0 527 381.0 528 2647.0 529 1849.0 530 613.0 531 3402.0 532 735.0 533 3115.0 534 1402.0 535 724.0 536 3831.0 537 802.0 538 724.0 539 3119.0 540 3115.0 541 3115.0 542 1810.0 543 1843.0 544 1815.0 545 1402.0 546 802.0 547 2647.0 548 614.0 549 613.0 550 3125.0 551 3831.0 552 3125.0 553 3410.0 554 802.0 555 3410.0 556 3802.0 557 3802.0 558 3410.0 559 3115.0 560 3006.0 561 3301.0 562 361.0 563 802.0 564 3810.0 565 2900.0 566 3301.0 567 3802.0 568 2905.0 569 1107.0 570 423.0 571 1402.0 572 3831.0 573 3301.0 574 3207.0 575 3207.0 576 3201.0 577 3114.0 578 3801.0 579 1402.0 580 522.0 581 3410.0 582 2101.0 583 2900.0 584 3410.0 585 3802.0 586 3115.0 587 413.0 588 3831.0 589 2610.0 590 612.0 591 3115.0 592 3301.0 593 3831.0 594 3410.0 595 3831.0 596 3410.0 597 3006.0 598 3115.0 599 802.0 600 3301.0 601 3301.0 602 3114.0 603 802.0 604 3802.0 605 3201.0 606 1106.0 607 3410.0 608 1849.0 609 3831.0 610 3831.0 611 413.0 612 1402.0 613 3831.0 614 3410.0 615 3301.0 616 3006.0 617 617.0 618 3201.0 619 3115.0 620 3201.0 621 3112.0 622 3301.0 623 3301.0 624 3410.0 625 3802.0 626 724.0 627 1402.0 628 614.0 629 2914.0 630 3831.0 631 3410.0 632 1402.0 633 3802.0 634 3831.0 635 613.0 636 619.0 637 615.0 638 3001.0 639 3410.0 640 3006.0 641 3410.0 642 3202.0 643 3006.0 644 3410.0 645 1402.0 646 3831.0 647 1402.0 648 3115.0 649 3410.0 650 3410.0 651 3410.0 652 724.0 653 3301.0 654 2007.0 655 3115.0 656 614.0 657 3410.0 658 3301.0 659 619.0 660 3410.0 661 724.0 662 3410.0 663 3114.0 664 3114.0 665 3410.0 666 3001.0 667 3006.0 668 3115.0 669 2610.0 670 724.0 671 3410.0 672 522.0 673 3006.0 674 3006.0 675 413.0 676 3114.0 677 3006.0 678 3006.0 679 1402.0 680 2629.0 681 2900.0 682 413.0 683 1503.0 684 3006.0 685 619.0 686 3114.0 687 3301.0 688 3115.0 689 301.0 690 413.0 691 3115.0 692 802.0 693 3802.0 694 361.0 695 802.0 696 3114.0 697 2403.0 698 619.0 699 3831.0 700 3114.0 701 3006.0 702 3301.0 703 2900.0 704 1402.0 705 3802.0 706 3831.0 707 2662.0 708 3006.0 709 3115.0 710 1846.0 711 1849.0 712 1841.0 713 3410.0 714 3001.0 715 3006.0 716 802.0 717 2629.0 718 619.0 719 522.0 720 520.0 721 3301.0 722 1402.0 723 1402.0 724 3502.0 725 802.0 726 802.0 727 2647.0 728 3119.0 729 3301.0 730 3202.0 731 2629.0 732 2905.0 733 3002.0 734 3301.0 735 613.0 736 619.0 737 3016.0 738 3006.0 739 619.0 740 3831.0 741 3115.0 742 3115.0 743 3114.0 744 3114.0 745 3201.0 746 3304.0 747 611.0 748 3803.0 749 2629.0 750 413.0 751 3006.0 752 2900.0 753 802.0 754 2914.0 755 3115.0 756 3831.0 757 560.0 758 3501.0 759 802.0 760 619.0 761 3410.0 762 1849.0 763 2647.0 764 3115.0 765 3831.0 766 423.0 767 1402.0 768 2647.0 769 3802.0 770 361.0 771 2622.0 772 423.0 773 3006.0 774 3114.0 775 3201.0 776 3802.0 777 2662.0 778 3831.0 779 2610.0 780 1102.0 781 2646.0 782 1849.0 783 3130.0 784 3006.0 785 1402.0 786 3301.0 787 801.0 788 1503.0 789 2407.0 790 2657.0 791 3801.0 792 3831.0 793 3801.0 794 3831.0 795 1849.0 796 3112.0 797 2610.0 798 3115.0 799 3114.0 800 3115.0 801 3115.0 802 1402.0 803 617.0 804 413.0 805 1849.0 806 3115.0 807 3831.0 808 3831.0 809 3810.0 810 1402.0 811 1831.0 812 3006.0 813 619.0 814 3006.0 815 3111.0 816 3115.0 817 3201.0 818 3831.0 819 3801.0 820 1810.0 821 1849.0 822 619.0 823 3115.0 824 617.0 825 3006.0 826 2647.0 827 2900.0 828 3301.0 829 2647.0 830 613.0 831 3410.0 832 706.0 833 802.0 834 3006.0 835 3006.0 836 614.0 837 3802.0 838 3006.0 839 619.0 840 3503.0 841 2900.0 842 3201.0 843 616.0 844 3115.0 845 1106.0 846 614.0 847 3410.0 848 3831.0 849 3201.0 850 1102.0 851 3201.0 852 617.0 853 3006.0 854 3115.0 855 1106.0 856 802.0 857 3119.0 858 3207.0 859 3115.0 860 706.0 861 3301.0 862 3831.0 863 311.0 864 1106.0 865 3410.0 866 2906.0 867 2629.0 868 413.0 869 802.0 870 3114.0 871 2905.0 872 2647.0 873 1109.0 874 802.0 875 2647.0 876 3301.0 877 724.0 878 2647.0 879 3002.0 880 3112.0 881 619.0 882 614.0 883 3115.0 884 614.0 885 3006.0 886 3006.0 887 3410.0 888 802.0 889 614.0 890 1402.0 891 3112.0 892 423.0 893 3006.0 894 3820.0 895 3301.0 896 3114.0 897 616.0 898 619.0 899 3201.0 900 613.0 901 3831.0 902 2647.0 903 3207.0 904 3801.0 905 3410.0 906 3301.0 907 3802.0 908 3803.0 909 3006.0 910 3115.0 911 619.0 912 3301.0 913 3831.0 914 1106.0 915 3114.0 916 3114.0 917 1402.0 918 724.0 919 3201.0 920 1402.0 921 706.0 922 616.0 923 3201.0 924 3301.0 925 3410.0 926 1402.0 927 3115.0 928 3410.0 929 3125.0 930 802.0 931 1402.0 932 3301.0 933 3125.0 934 2647.0 935 619.0 936 2900.0 937 3802.0 938 724.0 939 3802.0 940 3115.0 941 2646.0 942 2646.0 943 3125.0 944 3201.0 945 3006.0 946 3125.0 947 3114.0 948 2405.0 949 2647.0 950 2647.0 951 1402.0 952 3201.0 953 3831.0 954 706.0 955 3831.0 956 3410.0 957 1402.0 958 2646.0 959 3410.0 960 619.0 961 3115.0 962 3115.0 963 3410.0 964 3115.0 965 3301.0 966 3820.0 967 3831.0 968 802.0 969 1402.0 970 3410.0 971 1402.0 972 3001.0 973 3410.0 974 613.0 975 3803.0 976 3802.0 977 3109.0 978 614.0 979 3006.0 980 540.0 981 1402.0 982 3006.0 983 413.0 984 1402.0 985 3006.0 986 3006.0 987 3501.0 988 613.0 989 1402.0 990 3410.0 991 3006.0 992 802.0 993 1402.0 994 301.0 995 413.0 996 3115.0 997 3410.0 998 2906.0 999 3831.0 Name: OFFENSE_CODE, dtype: float64 Showing only the first 1000
df.cols.remove_special_chars("OFFENSE_CODE")
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\frame.py in __getattr__(self, key) 11826 try: > 11827 return self.loc[:, key] 11828 except KeyError: ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\indexing.py in __getitem__(self, key) 456 series_name, --> 457 ) = self._select_cols(cols_sel) 458 ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\indexing.py in _select_cols(self, cols_sel, missing_keys) 324 else: --> 325 return self._select_cols_else(cols_sel, missing_keys) 326 ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\indexing.py in _select_cols_else(self, cols_sel, missing_keys) 1305 cols_sel = (cols_sel,) -> 1306 return self._get_from_multiindex_column(cols_sel, missing_keys) 1307 ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\indexing.py in _get_from_multiindex_column(self, key, missing_keys, labels, recursed) 1145 if missing_keys is None: -> 1146 raise KeyError(k) 1147 else: KeyError: 'cols' During handling of the above exception, another exception occurred: AttributeError Traceback (most recent call last) C:\Users\ARGENI~1\AppData\Local\Temp/ipykernel_2660/4205330054.py in <module> ----> 1 df.cols.remove_special_chars("OFFENSE_CODE") ~\Documents\Optimus\examples\..\optimus\engines\base\columns.py in remove_special_chars(self, cols, output_cols) 2418 """ 2419 df = self.root -> 2420 return df.cols.replace(cols, [s for s in string.punctuation], "", "chars", output_cols=output_cols) 2421 2422 def to_datetime(self, cols="*", format=None, output_cols=None, transform_format=True) -> 'DataFrameType': ~\Documents\Optimus\examples\..\optimus\engines\spark\columns.py in replace(self, input_cols, search, replace_by, search_by, output_cols) 658 filter_dtype = [df.constants.NUMERIC_TYPES] 659 --> 660 columns = prepare_columns(df.data, input_cols, output_cols) 661 # columns = prepare_columns(df.data, input_cols, output_cols, filter_by_column_types=filter_dtype) 662 dfd = df.data ~\Documents\Optimus\examples\..\optimus\helpers\columns.py in prepare_columns(df, cols, output_cols, is_regex, filter_by_column_types, accepts_missing_cols, invert, default, cols_dict, auto_increment, args) 260 result = zip(*cols_dict) 261 else: --> 262 cols = parse_columns(df, cols, is_regex, filter_by_column_types, 263 accepts_missing_cols, invert) 264 merge = False ~\Documents\Optimus\examples\..\optimus\helpers\columns.py in parse_columns(df, cols_args, is_regex, filter_by_column_types, accepts_missing_cols, invert) 149 # if columns value is * get all dataframes columns 150 --> 151 df_columns = df.cols._names() 152 153 if is_regex is True: ~\Anaconda3\envs\python38\lib\site-packages\databricks\koalas\frame.py in __getattr__(self, key) 11827 return self.loc[:, key] 11828 except KeyError: > 11829 raise AttributeError( 11830 "'%s' object has no attribute '%s'" % (self.__class__.__name__, key) 11831 ) AttributeError: 'DataFrame' object has no attribute 'cols'
%%time
df.data["OFFENSE_DESCRIPTION"].value_counts()
Wall time: 73 ms
SICK/INJURED/MEDICAL - PERSON 18783 INVESTIGATE PERSON 18754 M/V - LEAVING SCENE - PROPERTY DAMAGE 16323 VANDALISM 15154 ASSAULT SIMPLE - BATTERY 14791 VERBAL DISPUTE 13099 TOWED MOTOR VEHICLE 11287 INVESTIGATE PROPERTY 11124 LARCENY THEFT FROM BUILDING 9069 THREATS TO DO BODILY HARM 9042 LARCENY THEFT FROM MV - NON-ACCESSORY 8893 PROPERTY - LOST 8893 WARRANT ARREST 8407 LARCENY SHOPLIFTING 7949 M/V ACCIDENT - PROPERTY DAMAGE 6557 LARCENY ALL OTHERS 5963 M/V ACCIDENT - PERSONAL INJURY 5131 ASSAULT - AGGRAVATED - BATTERY 4886 FRAUD - FALSE PRETENSE / SCHEME 4413 MISSING PERSON - LOCATED 4365 HARASSMENT 4007 MISSING PERSON 3766 PROPERTY - FOUND 3698 AUTO THEFT 3630 TRESPASSING 3254 FRAUD - CREDIT CARD / ATM FRAUD 3147 ROBBERY - STREET 3056 M/V ACCIDENT - OTHER 2925 ASSAULT - AGGRAVATED 2910 VAL - VIOLATION OF AUTO LAW - OTHER 2894 VAL - OPERATING AFTER REV/SUSP. 2618 DRUGS - POSS CLASS B - COCAINE, ETC. 2591 BURGLARY - RESIDENTIAL - FORCE 2585 SERVICE TO OTHER PD INSIDE OF MA. 2559 BURGLARY - RESIDENTIAL - NO FORCE 2358 LARCENY THEFT OF BICYCLE 2265 DRUGS - POSS CLASS B - INTENT TO MFR DIST DISP 2117 DRUGS - SALE / MANUFACTURING 1991 VAL - OPERATING WITHOUT LICENSE 1963 LARCENY THEFT OF MV PARTS & ACCESSORIES 1951 LICENSE PREMISE VIOLATION 1701 M/V ACCIDENT INVOLVING PEDESTRIAN - INJURY 1656 DRUGS - POSS CLASS A - HEROIN, ETC. 1514 FORGERY / COUNTERFEITING 1451 SICK/INJURED/MEDICAL - POLICE 1405 DISORDERLY CONDUCT 1397 SUDDEN DEATH 1347 VIOL. OF RESTRAINING ORDER W NO ARREST 1338 DRUGS - POSS CLASS A - INTENT TO MFR DIST DISP 1329 M/V - LEAVING SCENE - PERSONAL INJURY 1286 DRUGS - OTHER 1274 FIRE REPORT - HOUSE, BUILDING, ETC. 1269 DEATH INVESTIGATION 1215 DRUGS - SICK ASSIST - HEROIN 1205 FRAUD - IMPERSONATION 1065 VAL - OPERATING UNREG/UNINS �CAR 1051 RECOVERED - MV RECOVERED IN BOSTON (STOLEN OUTSIDE BOSTON) 1050 STOLEN PROPERTY - BUYING / RECEIVING / POSSESSING 1001 BALLISTICS EVIDENCE/FOUND 981 LANDLORD - TENANT SERVICE 968 SEARCH WARRANT 966 ASSAULT - SIMPLE 962 M/V ACCIDENT - INVOLVING �BICYCLE - INJURY 916 BURGLARY - COMMERICAL - FORCE 912 PROPERTY - ACCIDENTAL DAMAGE 912 WEAPON - FIREARM - CARRYING / POSSESSING, ETC 899 DRUGS - POSS CLASS D 897 DRUGS - POSS CLASS D - INTENT TO MFR DIST DISP 879 PROPERTY - MISSING 858 DISTURBING THE PEACE 837 AUTO THEFT - MOTORCYCLE / SCOOTER 815 LIQUOR - DRINKING IN PUBLIC 796 M/V ACCIDENT - POLICE VEHICLE 766 M/V ACCIDENT - OTHER CITY VEHICLE 732 FIREARM/WEAPON - FOUND OR CONFISCATED 688 BURGLARY - RESIDENTIAL - ATTEMPT 661 OTHER OFFENSE 644 ROBBERY - OTHER 635 WEAPON - OTHER - CARRYING / POSSESSING, ETC 623 NOISY PARTY/RADIO-NO ARREST 620 MISSING PERSON - NOT REPORTED - LOCATED 593 DRUGS - POSS CLASS E 586 ROBBERY - COMMERCIAL 554 FIRE REPORT - CAR, BRUSH, ETC. 525 M/V PLATES - LOST 493 OPERATING UNDER THE INFLUENCE ALCOHOL 484 VAL - OPERATING W/O AUTHORIZATION LAWFUL 463 DRUGS - POSS CLASS C 458 LARCENY PICK-POCKET 455 PROPERTY - STOLEN THEN RECOVERED 452 M/V ACCIDENT - INVOLVING BICYCLE - NO INJURY 426 M/V ACCIDENT - INVOLVING PEDESTRIAN - NO INJURY 414 EVADING FARE 407 AUTO THEFT - LEASED/RENTED VEHICLE 402 VIOLATION - CITY ORDINANCE 370 ANIMAL CONTROL - DOG BITES - ETC. 367 SUICIDE / SUICIDE ATTEMPT 356 DRUGS - SICK ASSIST - OTHER HARMFUL DRUG 337 FRAUD - WIRE 334 DRUGS - SICK ASSIST - OTHER NARCOTIC 311 INTIMIDATING WITNESS 309 BURGLARY - COMMERICAL - NO FORCE 308 DANGEROUS OR HAZARDOUS CONDITION 307 EMBEZZLEMENT 296 ANIMAL INCIDENTS 285 DEMONSTRATIONS/RIOT 279 VIOL. OF RESTRAINING ORDER W ARREST 268 GRAFFITI 261 CHILD ENDANGERMENT 256 LIQUOR LAW VIOLATION 256 BURGLARY - OTHER - FORCE 249 AFFRAY 249 WEAPON - FIREARM - OTHER VIOLATION 238 PROPERTY - LOST THEN LOCATED 227 POSSESSION OF BURGLARIOUS TOOLS 227 FUGITIVE FROM JUSTICE 220 HARBOR INCIDENT / VIOLATION 212 DRUGS - POSS CLASS C - INTENT TO MFR DIST DISP 205 DRUGS - POSSESSION 195 SERVICE TO OTHER PD OUTSIDE OF MA. 195 INVESTIGATION FOR ANOTHER AGENCY 194 EXTORTION OR BLACKMAIL 188 LARCENY PURSE SNATCH - NO FORCE 188 CHILD ENDANGERMENT (NO ASSAULT) 185 ROBBERY - BANK 181 DRUGS - CLASS B TRAFFICKING OVER 18 GRAMS 177 BURGLARY - OTHER - NO FORCE 171 REPORT AFFECTING OTHER DEPTS. 166 PROSTITUTION - SOLICITING 161 MURDER, NON-NEGLIGIENT MANSLAUGHTER 161 DRUGS - POSS CLASS E - INTENT TO MFR DIST DISP 154 WEAPON - OTHER - OTHER VIOLATION 147 DRUGS - CLASS A TRAFFICKING OVER 18 GRAMS 132 CRIMINAL HARASSMENT 131 FIRE REPORT/ALARM - FALSE 126 ANNOYING AND ACCOSTING 126 BURGLARY - COMMERICAL - ATTEMPT 112 ROBBERY - HOME INVASION 104 ARSON 94 OPERATING UNDER THE INFLUENCE DRUGS 90 ROBBERY - CAR JACKING 86 DRUGS - CONSP TO VIOL CONTROLLED SUBSTANCE 86 OBSCENE MATERIALS - PORNOGRAPHY 80 HOME INVASION 77 DRUGS - POSSESSION OF DRUG PARAPHANALIA 75 BOMB THREAT 75 RECOVERED STOLEN PLATE 67 ANIMAL ABUSE 64 STALKING 62 CHILD ABANDONMENT (NO ASSAULT) 59 INJURY BICYCLE NO M/V INVOLVED 57 VIOLATION - CITY ORDINANCE CONSTRUCTION PERMIT 51 KIDNAPPING/CUSTODIAL KIDNAPPING 48 KIDNAPPING - ENTICING OR ATTEMPTED 44 BURGLARY - OTHER - ATTEMPT 41 CHINS 36 AIRCRAFT INCIDENTS 36 OBSCENE PHONE CALLS 31 TRUANCY / RUNAWAY 31 NOISY PARTY/RADIO-ARREST 29 PROSTITUTION 29 PROTECTIVE CUSTODY / SAFEKEEPING 27 GATHERING CAUSING ANNOYANCE 27 PRISONER - SUICIDE / SUICIDE ATTEMPT 23 CONSPIRACY EXCEPT DRUG LAW 19 LARCENY THEFT FROM COIN-OP MACHINE 16 FRAUD - WELFARE 15 VIOLATION - HAWKER AND PEDDLER 15 EXPLOSIVES - TURNED IN OR FOUND 14 EXPLOSIVES - POSSESSION OR USE 13 WEAPON - FIREARM - SALE / TRAFFICKING 13 ABDUCTION - INTICING 12 PROSTITUTION - COMMON NIGHTWALKER 12 PROPERTY - CONCEALING LEASED 10 FIREARM/WEAPON - LOST 10 FIREARM/WEAPON - ACCIDENTAL INJURY / DEATH 10 PRISONER ESCAPE / ESCAPE & RECAPTURE 9 CUSTODIAL KIDNAPPING 9 CONTRIBUTING TO DELINQUENCY OF MINOR 8 GAMBLING - BETTING / WAGERING 8 ASSAULT & BATTERY 8 HUMAN TRAFFICKING - COMMERCIAL SEX ACTS 7 LARCENY OTHER $200 & OVER 7 DRUGS - CLASS D TRAFFICKING OVER 50 GRAMS 6 ASSAULT & BATTERY D/W - OTHER 6 LARCENY IN A BUILDING $200 & OVER 5 MANSLAUGHTER - VEHICLE - NEGLIGENCE 5 DRUGS - POSS CLASS A - HEROIN, ETC. 5 PROSTITUTION - ASSISTING OR PROMOTING 4 LARCENY SHOPLIFTING UNDER $50 4 LARCENY SHOPLIFTING $200 & OVER 4 LARCENY BICYCLE $200 & OVER 3 ROBBERY - UNARMED - STREET 3 A&B ON POLICE OFFICER 3 FIREARM/WEAPON - POSSESSION OF DANGEROUS 3 B&E NON-RESIDENCE DAY - NO FORCE 2 B&E NON-RESIDENCE DAY - FORCIBLE 2 PROPERTY - RECEIVING STOLEN 2 LARCENY SHOPLIFTING $50 TO $199 2 FORGERY OR UTTERING 2 DRUGS - POSS CLASS E INTENT TO MF DIST DISP 2 MANSLAUGHTER - NON-VEHICLE - NEGLIGENCE 2 LARCENY OTHER $50 TO $199 2 AUTO THEFT - RECOVERED IN BY POLICE 2 BIOLOGICAL THREATS 2 HUMAN TRAFFICKING - INVOLUNTARY SERVITUDE 2 DRUGS - POSS CLASS D - MARIJUANA, ETC. 1 ROBBERY - KNIFE - STREET 1 B&E NON-RESIDENCE DAY - NO PROP TAKEN 1 COUNTERFEITING 1 LARCENY OTHER UNDER $50 1 AUTO THEFT LEASE/RENT VEHICLE 1 LARCENY IN A BUILDING UNDER $50 1 A&B HANDS, FEET, ETC. - MED. ATTENTION REQ. 1 ROBBERY ATTEMPT - KNIFE - BANK 1 ROBBERY - UNARMED - CHAIN STORE 1 AUTO THEFT - OUTSIDE - RECOVERED IN BOSTON 1 KILLING OF FELON BY POLICE 1 ROBBERY - UNARMED - BUSINESS 1 PROSTITUTE - COMMON NIGHTWALKER 1 VIOLATION - RESTRAINING ORDER 1 ASSAULT D/W - KNIFE ON POLICE OFFICER 1 B&E NON-RESIDENCE NIGHT - FORCE 1 AUTO THEFT OTHER 1 B&E RESIDENCE NIGHT - ATTEMPT FORCE 1 DISORDERLY PERSON 1 DRUGS - GLUE INHALATION 1 FRAUD - FALSE PRETENSE 1 CHILD ABUSE 1 LARCENY NON-ACCESSORY FROM VEH. $200 & OVER 1 LARCENY IN A BUILDING $50 TO $199 1 B&E RESIDENCE DAY - NO PROP TAKEN 1 LARCENY NON-ACCESSORY FROM VEH. $50 TO $199 1 ASSAULT & BATTERY D/W - OTHER ON POLICE OFFICER 1 DRUGS - POSS CLASS D - INTENT MFR DIST DISP 1 ROBBERY - UNARMED - RESIDENCE 1 ASSAULT & BATTERY D/W - KNIFE 1 B&E RESIDENCE DAY - NO FORCE 1 FIREARM/WEAPON - CARRY - SELL - RENT 1 FRAUDS - ALL OTHER 1 ASSAULT D/W - OTHER 1 LARCENY NON-ACCESSORY FROM VEH. UNDER $50 1 ANNOYING AND ACCOSTIN 1 PRISONER ATTEMPT TO RESCUE 1 Name: OFFENSE_DESCRIPTION, dtype: int64
%%time
df.cols.frequency("OFFENSE_DESCRIPTION")
Wall time: 13.1 s
{'frequency': {'OFFENSE_DESCRIPTION': {'values': [{'value': 'SICK/INJURED/MEDICAL - PERSON', 'count': 18783}, {'value': 'INVESTIGATE PERSON', 'count': 18754}, {'value': 'M/V - LEAVING SCENE - PROPERTY DAMAGE', 'count': 16323}, {'value': 'VANDALISM', 'count': 15154}, {'value': 'ASSAULT SIMPLE - BATTERY', 'count': 14791}, {'value': 'VERBAL DISPUTE', 'count': 13099}, {'value': 'TOWED MOTOR VEHICLE', 'count': 11287}, {'value': 'INVESTIGATE PROPERTY', 'count': 11124}, {'value': 'LARCENY THEFT FROM BUILDING', 'count': 9069}, {'value': 'THREATS TO DO BODILY HARM', 'count': 9042}, {'value': 'LARCENY THEFT FROM MV - NON-ACCESSORY', 'count': 8893}, {'value': 'PROPERTY - LOST', 'count': 8893}, {'value': 'WARRANT ARREST', 'count': 8407}, {'value': 'LARCENY SHOPLIFTING', 'count': 7949}, {'value': 'M/V ACCIDENT - PROPERTY DAMAGE', 'count': 6557}, {'value': 'LARCENY ALL OTHERS', 'count': 5963}, {'value': 'M/V ACCIDENT - PERSONAL INJURY', 'count': 5131}, {'value': 'ASSAULT - AGGRAVATED - BATTERY', 'count': 4886}, {'value': 'FRAUD - FALSE PRETENSE / SCHEME', 'count': 4413}, {'value': 'MISSING PERSON - LOCATED', 'count': 4365}, {'value': 'HARASSMENT', 'count': 4007}, {'value': 'MISSING PERSON', 'count': 3766}, {'value': 'PROPERTY - FOUND', 'count': 3698}, {'value': 'AUTO THEFT', 'count': 3630}, {'value': 'TRESPASSING', 'count': 3254}, {'value': 'FRAUD - CREDIT CARD / ATM FRAUD', 'count': 3147}, {'value': 'ROBBERY - STREET', 'count': 3056}, {'value': 'M/V ACCIDENT - OTHER', 'count': 2925}, {'value': 'ASSAULT - AGGRAVATED', 'count': 2910}, {'value': 'VAL - VIOLATION OF AUTO LAW - OTHER', 'count': 2894}, {'value': 'VAL - OPERATING AFTER REV/SUSP.', 'count': 2618}, {'value': 'DRUGS - POSS CLASS B - COCAINE, ETC.', 'count': 2591}]}}}
df.cols.
from pyspark.sql.types import *
from datetime import date, datetime
cols = [
("names", "str"),
("height(ft)", ShortType()),
("function", "str"),
("rank", ByteType()),
("age", "int"),
("weight(t)", "float"),
"japanese name",
"last position seen",
"date arrival",
"last date seen",
("attributes", ArrayType(FloatType())),
("DateType", DateType()),
("timestamp", TimestampType()),
("Cybertronian", BooleanType()),
("function(binary)", BinaryType()),
("NullType", NullType())
]
rows = [
("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
(None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]
df = op.create.df(cols ,rows, False).cache().repartition(1)
df.ext.display(20)
names
1 (string)
not nullable
|
height(ft)
2 (smallint)
not nullable
|
function
3 (string)
not nullable
|
rank
4 (tinyint)
not nullable
|
age
5 (int)
not nullable
|
weight(t)
6 (float)
not nullable
|
japanese name
7 (string)
not nullable
|
last position seen
8 (string)
not nullable
|
date arrival
9 (string)
not nullable
|
last date seen
10 (string)
not nullable
|
attributes
11 (array<float>)
not nullable
|
DateType
12 (date)
not nullable
|
timestamp
13 (timestamp)
not nullable
|
Cybertronian
14 (boolean)
not nullable
|
function(binary)
15 (binary)
not nullable
|
NullType
16 (null)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
argenisleon@gmail.com
|
28.0
|
Leader
|
10
|
5000000
|
4.300000190734863
|
[Inochi,⋅Convoy]
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17.0
|
Espionage
|
7
|
5000000
|
2.0
|
[Bumble,⋅Goldback]
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26.0
|
Security
|
7
|
5000000
|
4.0
|
[Roadbuster]
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13.0
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
[Meister]
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1⋅Megatron
|
nan
|
None
|
10
|
5000000
|
5.699999809265137
|
[Megatron]
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
None
|
300.0
|
Battle⋅Station
|
8
|
5000000
|
nan
|
[Metroflex]
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
df.display()
INCIDENT_NUMBER
1 (object)
|
OFFENSE_CODE
2 (int64)
|
OFFENSE_CODE_GROUP
3 (object)
|
OFFENSE_DESCRIPTION
4 (object)
|
DISTRICT
5 (object)
|
REPORTING_AREA
6 (object)
|
SHOOTING
7 (object)
|
OCCURRED_ON_DATE
8 (object)
|
YEAR
9 (int64)
|
MONTH
10 (int64)
|
DAY_OF_WEEK
11 (object)
|
HOUR
12 (int64)
|
UCR_PART
13 (object)
|
STREET
14 (object)
|
Lat
15 (object)
|
Long
16 (object)
|
Location
17 (object)
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
"(42.35779134
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
"(42.30682138
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
"(42.34658879
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
"(42.33418175
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
"(42.27536542
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
"(42.29019621
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
"(42.30607218
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
"(42.32701648
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
"(42.33152148
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
"(42.29514664
|
I182070928
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C6
|
913
|
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
"(42.31957856
|
df.cols.count_by_dtypes("*", infer=False)
{'SHOOTING': {'null': 318054, 'missing': 0, 'string': 1019}, 'MONTH': {'null': 0, 'missing': 0, 'int': 319073}, 'HOUR': {'null': 0, 'missing': 0, 'int': 319073}, 'Lat': {'null': 19999, 'missing': 0, 'decimal': 299074}, 'STREET': {'null': 10871, 'missing': 0, 'string': 308202}, 'DISTRICT': {'null': 1765, 'missing': 0, 'string': 317308}, 'OFFENSE_CODE_GROUP': {'null': 0, 'missing': 0, 'string': 319073}, 'REPORTING_AREA': {'null': 0, 'missing': 0, 'string': 319073}, 'OCCURRED_ON_DATE': {'null': 0, 'missing': 0, 'date': 319073}, 'UCR_PART': {'null': 90, 'missing': 0, 'string': 318983}, 'INCIDENT_NUMBER': {'null': 0, 'missing': 0, 'string': 319073}, 'DAY_OF_WEEK': {'null': 0, 'missing': 0, 'string': 319073}, 'OFFENSE_DESCRIPTION': {'null': 0, 'missing': 0, 'string': 319073}, 'YEAR': {'null': 0, 'missing': 0, 'int': 319073}, 'Long': {'null': 19999, 'missing': 0, 'decimal': 299074}, 'OFFENSE_CODE': {'null': 0, 'missing': 0, 'int': 319073}, 'Location': {'null': 0, 'missing': 0, 'string': 319073}}
from optimus.helpers.check import is_column_a
is_column_a(df,"OCCURRED_ON_DATE","timestamp")
print(df.cols.schema_dtype("OCCURRED_ON_DATE"))
TimestampType
df.dtypes
[('INCIDENT_NUMBER', 'string'), ('OFFENSE_CODE', 'int'), ('OFFENSE_CODE_GROUP', 'string'), ('OFFENSE_DESCRIPTION', 'string'), ('DISTRICT', 'string'), ('REPORTING_AREA', 'string'), ('SHOOTING', 'string'), ('OCCURRED_ON_DATE', 'timestamp'), ('YEAR', 'int'), ('MONTH', 'int'), ('DAY_OF_WEEK', 'string'), ('HOUR', 'int'), ('UCR_PART', 'string'), ('STREET', 'string'), ('Lat', 'double'), ('Long', 'double'), ('Location', 'string')]
df.cols.std("OCCURRED_ON_DATE")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-36-c1c8bd292777> in <module> ----> 1 df.cols.std("OCCURRED_ON_DATE") ~\Documents\Optimus\optimus\engines\spark\columns.py in std(columns) 745 """ 746 columns = parse_columns(self, columns, filter_by_column_dtypes=self.constants.NUMERIC_TYPES) --> 747 check_column_numbers(columns, "*") 748 749 return format_dict(Cols.agg_exprs(columns, F.stddev)) ~\Documents\Optimus\optimus\helpers\columns.py in check_column_numbers(columns, number) 216 if columns is None: 217 RaiseIt.value_error(columns, ["str", "list"], --> 218 extra_text="Maybe the columns selected do not match a specified datatype filter.") 219 220 count = len(columns) ~\Documents\Optimus\optimus\helpers\raiseit.py in value_error(var, data_values, extra_text) 74 type=divisor.join(map( 75 lambda x: "'" + x + "'", ---> 76 data_values)), var_type=one_list_to_val(var), extra_text=extra_text)) 77 78 @staticmethod ValueError: 'columns' must be 'str' or 'list', received 'None'. Maybe the columns selected do not match a specified datatype filter.
df.ext.send("OCCURRED_ON_DATE")
df.cols.hist("*")
df.cols.hist("INCIDENT_NUMBER")
VVV StringType DATA (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>) VVV StringType DATA (<class 'pyspark.sql.types.StringType'>,) EXEC AGG 1
{'INCIDENT_NUMBER': {'hist': [{'count': 0.0, 'lower': 0.0, 'upper': 2.5}, {'count': 0.0, 'lower': 2.5, 'upper': 5.0}, {'count': 0.0, 'lower': 5.0, 'upper': 7.5}, {'count': 1.0, 'lower': 7.5, 'upper': 10.0}, {'count': 318719.0, 'lower': 10.0, 'upper': 12.5}, {'count': 353.0, 'lower': 12.5, 'upper': 15.0}, {'count': 0.0, 'lower': 15.0, 'upper': 17.5}, {'count': 0.0, 'lower': 17.5, 'upper': 20.0}, {'count': 0.0, 'lower': 20.0, 'upper': 22.5}, {'count': 0.0, 'lower': 22.5, 'upper': 25.0}, {'count': 0.0, 'lower': 25.0, 'upper': 27.5}, {'count': 0.0, 'lower': 27.5, 'upper': 30.0}, {'count': 0.0, 'lower': 30.0, 'upper': 32.5}, {'count': 0.0, 'lower': 32.5, 'upper': 35.0}, {'count': 0.0, 'lower': 35.0, 'upper': 37.5}, {'count': 0.0, 'lower': 37.5, 'upper': 40.0}, {'count': 0.0, 'lower': 40.0, 'upper': 42.5}, {'count': 0.0, 'lower': 42.5, 'upper': 45.0}, {'count': 0.0, 'lower': 45.0, 'upper': 47.5}, {'count': 0.0, 'lower': 47.5, 'upper': 50.0}]}}
df.outliers.tukey("height(ft)").select().ext.display()
ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>) 0.5 44.5 ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>)
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (string)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
None
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
[Metroflex]
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
outlier.hist("price")
'{"price": {"hist": [{"count": 6.0, "lower": 8.0, "upper": 8.1}, {"count": 0.0, "lower": 8.1, "upper": 8.2}, {"count": 0.0, "lower": 8.2, "upper": 8.3}, {"count": 0.0, "lower": 8.3, "upper": 8.4}, {"count": 0.0, "lower": 8.4, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 8.6}, {"count": 0.0, "lower": 8.6, "upper": 8.7}, {"count": 0.0, "lower": 8.7, "upper": 8.8}, {"count": 0.0, "lower": 8.8, "upper": 8.9}, {"count": 0.0, "lower": 8.9, "upper": 9.0}, {"count": 2.0, "lower": 9.0, "upper": 9.1}, {"count": 0.0, "lower": 9.1, "upper": 9.2}, {"count": 0.0, "lower": 9.2, "upper": 9.3}, {"count": 0.0, "lower": 9.3, "upper": 9.4}, {"count": 0.0, "lower": 9.4, "upper": 9.5}, {"count": 0.0, "lower": 9.5, "upper": 9.6}, {"count": 0.0, "lower": 9.6, "upper": 9.7}, {"count": 0.0, "lower": 9.7, "upper": 9.8}, {"count": 0.0, "lower": 9.8, "upper": 9.9}, {"count": 0.0, "lower": 9.9, "upper": 10.0}]}}'
df.cols.count_by_dtypes("id")
{'id': {'null': 0, 'missing': 0, 'int': 19}}
df.count()
19
outlier.info()
6
{'count_outliers': 9, 'count_non_outliers': 10, 'lower_bound': 6, 'lower_bound_count': 9, 'upper_bound': 10, 'upper_bound_count': 0}
# df.table()
df.cols.count_mismatch({"names":"argenisleon@gmail.com","names":"email"})
{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}
a = {'names': {'email': 1, 'mismatch': 4, 'null': 1}}
tuple({"firstName":"string","lastName":"array"}.values())
('string', 'array')
from infer import Infer
from infer import Infer
Infer.mismatch(("names",None),{"names":"email"})
(('names', 'null'), 1)
Infer.value(12, "string")
list({"firstName":"string","lastName":"string"}.keys())
['firstName', 'lastName']
df.rows.select_by_dtypes("names","str")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-8-5a6988a57346> in <module> ----> 1 df.rows.select_by_dtypes("names","str") NameError: name 'df' is not defined
# Histograma
df.rows.between("height(ft)",17,26, invert = False , equal =True, ).table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
df.cols.reverse("function").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
redaeL
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
eganoipsE
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
ytiruceS
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13
|
tnanetueiL⋅tsriF
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1⋅Megatron
|
None
|
enoN
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
megatron⋅1
|
300
|
noitatS⋅elttaB
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
outlier = df.outliers.tukey("mass (g)")
# print(outlier.info())
outlier.select_lower_bound()
'{"columns": [{"title": "mass (g)"}], "value": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'
keyCol.fingerprint(df,"product").table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
product***FINGERPRINT
9 (string)
nullable
|
---|---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
cake
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
piza
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
pizza
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
pizza
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
pizza
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
arepa
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
taco
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
taaaccoo
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
taco
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
taco
|
keyCol.fingerprint(df,"names").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
names***FINGERPRINT
17 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
optimusoptimusprime
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
bumblebee
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
ironhide
|
1⋅Megatron
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1megatron
|
1⋅Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
1megatron
|
megatron⋅1
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
1megatron
|
keyCol.fingerprint_cluster(df,"product", output="json")
'{"taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "pizzza": {"similar": {"pizzza": 1}, "count": 1, "sum": 1}, "arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "pizza": {"similar": {"pizza": 4}, "count": 1, "sum": 4}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}}'
keyCol.n_gram_fingerprint_cluster(df,"product", output="json",n_size=2)
'{"arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}, "pizza": {"similar": {"pizzza": 1, "pizza": 4}, "count": 2, "sum": 5}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}}'
from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc
dc.levenshtein_cluster(df,"product", output="json")
'{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}'
keyCol.n_gram_fingerprint_cluster(df,"names", n_size=1,output="json")
count
1 (string)
not nullable
|
names
2 (string)
nullable
|
names***NGRAM
3 (array<string>)
not nullable
|
names***NGRAM_FINGERPRINT
4 (string)
nullable
|
---|---|---|---|
1
|
bumbl#ebéé⋅⋅
|
['bumblebee']
|
bumblebee
|
1
|
ironhide&
|
['ironhide']
|
ironhide
|
1
|
Megatron2
|
['megatron2']
|
megatron2
|
1
|
Optimus⋅OptimusPrime
|
['optimusoptimusprime']
|
optimusoptimusprime
|
1
|
Megatron1
|
['megatron1']
|
megatron1
|
1
|
Megatron
|
['megatron']
|
megatron
|
'{"ironhide&": {"similar": {"ironhide&": 1}, "count": 1, "sum": 1.0}, "Megatron1": {"similar": {"Megatron1": 1}, "count": 1, "sum": 1.0}, "Optimus OptimusPrime": {"similar": {"Optimus OptimusPrime": 1}, "count": 1, "sum": 1.0}, "Megatron": {"similar": {"Megatron": 1}, "count": 1, "sum": 1.0}, "bumbl#eb\\u00e9\\u00e9 ": {"similar": {"bumbl#eb\\u00e9\\u00e9 ": 1}, "count": 1, "sum": 1.0}, "Megatron2": {"similar": {"Megatron2": 1}, "count": 1, "sum": 1.0}}'
df.table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
Megatron1
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
megatron
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")
df.table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
JaJa⋅JaJaJ
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
Metroplex_)^$
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
df.cols.replace("names",["JaJa","bbb"],"aaa",search_by="words").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
aaa⋅JaJaJ
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
Metroplex_)^$
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
df.send()
Send!
df.table(20)
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
11
|
Isaac
|
Newton
|
992
|
pasta
|
9
|
1999/02/15
|
never⋅
|
12
|
Emmy%%
|
Nöether$
|
234
|
pasta
|
9
|
1993/12/08
|
gonna
|
13
|
Max!!!
|
Planck!!!
|
111
|
hamburguer
|
4
|
1994/01/04
|
run⋅
|
14
|
Fred
|
Hoy&&&le
|
553
|
pizzza
|
8
|
1997/06/27
|
around
|
15
|
(((⋅⋅⋅Heinrich⋅)))))
|
Hertz
|
116
|
pizza
|
8
|
1956/11/30
|
and
|
16
|
William
|
Gilbert###
|
886
|
BEER
|
2
|
1958/03/26
|
desert
|
17
|
Marie
|
CURIE
|
912
|
Rice
|
1
|
2000/03/22
|
you
|
18
|
Arthur
|
COM%%%pton
|
812
|
110790
|
5
|
1899/01/01
|
#
|
19
|
JAMES
|
Chadwick
|
467
|
null
|
10
|
1921/05/03
|
#
|
df.outliers.z_score("price",threshold =1).info()
{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}
df.outliers.tukey("price").info()
{'count_outliers': 0, 'count_non_outliers': 19, 'lower_bound': -4.5, 'lower_bound_count': 0, 'upper_bound': 15.5, 'upper_bound_count': 0, 'iqr1': 3, 'iqr3': 8}
df.outliers.mad("price", threshold =1).info()
{'count_outliers': 9, 'count_non_outliers': 19, 'lower_bound': 6, 'lower_bound_count': 9, 'upper_bound': 10, 'upper_bound_count': 0}
df.outliers.modified_z_score("price",threshold =1).info()
{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}
%%time
from optimus.ml import distancecluster as dc
print(dc.levenshtein_cluster(df,'product',output="json"))
{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}} Wall time: 9.6 s
from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc
# result = dc.levenshtein_json(df,'product')
result = kc.fingerprint_cluster(df, "product",3)
result = kc.n_gram_fingerprint_cluster(df, "product",3)
count
1 (string)
not nullable
|
product
2 (string)
nullable
|
product***NGRAM
3 (array<string>)
not nullable
|
product***NGRAM_FINGERPRINT
4 (string)
nullable
|
---|---|---|---|
1
|
taaaccoo
|
['taaaccoo']
|
taaaccoo
|
1
|
piza
|
['piza']
|
piza
|
1
|
hamburguer
|
['hamburguer']
|
hamburguer
|
3
|
taco
|
['taco']
|
taco
|
1
|
BEER
|
['beer']
|
beer
|
1
|
pizzza
|
['pizzza']
|
pizzza
|
1
|
arepa
|
['arepa']
|
arepa
|
4
|
pizza
|
['pizza']
|
pizza
|
1
|
Rice
|
['rice']
|
rice
|
1
|
110790
|
['110790']
|
110790
|
print(result)
{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}
type(result)
str
kv_dict ={}
for row in result.collect():
_row = list(row.asDict().values())
print(_row)
kv_dict[_row[0]] = _row[1]
['taaaccoo', 1] ['piza', 1] ['hamburguer', 1] ['taco', 3] ['BEER', 1] ['pizzza', 1] ['arepa', 1] ['pizza', 4] ['Rice', 1] ['110790', 1] ['Cake', 1] ['null', 1] ['pasta', 2]
print(kv_dict)
{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-46-d4821b29c8c9> in <module> ----> 1 a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table() AttributeError: 'str' object has no attribute 'cols'
a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-47-fb2466275319> in <module> ----> 1 a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table() AttributeError: 'str' object has no attribute 'rows'