%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
# Create optimus
_load = Optimus("ibis", verbose = True)
# Put your db credentials here
db = _load.connect(
driver="impala",
host="35.223.122.210",
database= "default",
user= "cloudera",
password = "cloudera")
impala://cloudera:cloudera@35.223.122.210:10000/default
df = db.table_to_df("prueba")
_processing = Optimus("pandas", verbose = True)
df = _processing.from_dataframe(df)
# https://github.com/cloudera/impyla/issues/315
# pip uninstall pure-sasl
#pip uninstall sasl
!pip install pure-sasl
Collecting pure-sasl Downloading pure-sasl-0.6.2.tar.gz (11 kB) Building wheels for collected packages: pure-sasl Building wheel for pure-sasl (setup.py): started Building wheel for pure-sasl (setup.py): finished with status 'done' Created wheel for pure-sasl: filename=pure_sasl-0.6.2-py3-none-any.whl size=11435 sha256=5aa6d172a618efdeb1b9a62e705812572f668681aed405d6f40a258c76176ab5 Stored in directory: c:\users\argenisleon\appdata\local\pip\cache\wheels\60\6d\c4\eeb9f5eae120dce84399f49fc92b447f5b9c2788217aee3209 Successfully built pure-sasl Installing collected packages: pure-sasl Successfully installed pure-sasl-0.6.2
import ibis
hdfs = ibis.impala.hdfs_connect(host='localhost')
con = ibis.impala.connect(user="cloudera", password="cloudera",
host='35.223.122.210', port= 21050, database='default'
)
Without an HDFS connection, certain functionality may be disabled
con.list_tables()
['prueba']
table = con.table('prueba', database='default')
table.compile()
'SELECT *\nFROM default.`prueba`'
dir(table)
['__bool__', '__call__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_alter_table_helper', '_assert_valid', '_client', '_database', '_ensure_expr', '_execute', '_factory', '_get_type', '_is_materialized', '_is_valid', '_key', '_match_name', '_qualified_name', '_repr', '_repr_png_', '_resolve', '_root_tables', '_safe_name', '_type_display', '_unqualified_name', 'add_partition', 'aggregate', 'alter', 'alter_partition', 'anti_join', 'any_inner_join', 'any_left_join', 'asof_join', 'column_stats', 'columns', 'compile', 'compute_stats', 'count', 'cross_join', 'describe_formatted', 'difference', 'distinct', 'drop', 'drop_partition', 'equals', 'execute', 'files', 'filter', 'get_column', 'get_columns', 'group_by', 'groupby', 'head', 'info', 'inner_join', 'insert', 'intersect', 'invalidate_metadata', 'is_partitioned', 'join', 'left_join', 'limit', 'load_data', 'materialize', 'metadata', 'mutate', 'name', 'op', 'outer_join', 'partition_schema', 'partitions', 'pipe', 'projection', 'prueba1', 'refresh', 'relabel', 'rename', 'rowid', 'schema', 'select', 'semi_join', 'set_column', 'set_external', 'sort_by', 'stats', 'to_array', 'truncate', 'union', 'verify', 'view', 'visualize']
table.execute()
# dir(table)
prueba1 | |
---|---|
0 | prueba |
con.tables()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-13-95c5b7e060c4> in <module> ----> 1 con.tables() AttributeError: 'ImpalaClient' object has no attribute 'tables'
db.tables()
['HDFS', 'ImpalaClient', 'ImpalaConnection', 'ImpalaDatabase', 'ImpalaTable', 'WebHDFS', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'add_operation', 'aggregate_function', 'client', 'com', 'compat', 'compile', 'compiler', 'connect', 'ddl', 'dialect', 'hdfs_connect', 'options', 'scalar_function', 'udf', 'verify', 'wrap_uda', 'wrap_udf']
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-19-cdef22199e9a> in <module> ----> 1 db.tables() ~\Documents\Optimus\optimus\engines\ibis\io\jdbc.py in tables(self, schema, database, limit) 53 def tables(self, schema=None, database=None, limit=None): 54 print(dir(ibis.impala)) ---> 55 con = ibis.impala.connect(url=self.uri) 56 return con.list_tables() 57 TypeError: connect() got an unexpected keyword argument 'url'
db.table_to_df("prueba").ext.display()
--------------------------------------------------------------------------- TProtocolException Traceback (most recent call last) ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1178 -> 1179 context = constructor(dialect, self, conn, *args) 1180 except BaseException as e: ~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in _init_statement(cls, dialect, connection, dbapi_connection, statement, parameters) 908 --> 909 self.cursor = self.create_cursor() 910 return self ~\Anaconda3\lib\site-packages\impala\sqlalchemy.py in create_cursor(self) 136 cursor_configuration = self.execution_options.get('cursor_configuration', {}) --> 137 return self._dbapi_connection.cursor(configuration=cursor_configuration) 138 ~\Anaconda3\lib\site-packages\sqlalchemy\pool\base.py in cursor(self, *args, **kwargs) 964 """ --> 965 return self.connection.cursor(*args, **kwargs) 966 ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in cursor(self, user, configuration, convert_types, dictify, fetch_error) 123 --> 124 session = self.service.open_session(user, configuration) 125 ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in open_session(self, user, configuration) 1074 configuration=configuration) -> 1075 resp = self._rpc('OpenSession', req) 1076 return HS2Session(self, resp.sessionHandle, ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in _rpc(self, func_name, request) 992 self._log_request(func_name, request) --> 993 response = self._execute(func_name, request) 994 self._log_response(func_name, response) ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in _execute(self, func_name, request) 1009 func = getattr(self.client, func_name) -> 1010 return func(request) 1011 except socket.error: ~\Anaconda3\lib\site-packages\thriftpy2\thrift.py in _req(self, _api, *args, **kwargs) 218 if not getattr(result_cls, "oneway"): --> 219 return self._recv(_api) 220 ~\Anaconda3\lib\site-packages\thriftpy2\thrift.py in _recv(self, _api) 230 def _recv(self, _api): --> 231 fname, mtype, rseqid = self._iprot.read_message_begin() 232 if mtype == TMessageType.EXCEPTION: ~\Anaconda3\lib\site-packages\thriftpy2\protocol\binary.py in read_message_begin(self) 372 api, ttype, seqid = read_message_begin( --> 373 self.trans, strict=self.strict_read) 374 return api, ttype, seqid ~\Anaconda3\lib\site-packages\thriftpy2\protocol\binary.py in read_message_begin(inbuf, strict) 178 raise TProtocolException(type=TProtocolException.BAD_VERSION, --> 179 message='No protocol version header') 180 TProtocolException: No protocol version header The above exception was the direct cause of the following exception: StatementError Traceback (most recent call last) <ipython-input-13-c94897551148> in <module> ----> 1 db.table_to_df("prueba").ext.display() ~\Documents\Optimus\optimus\engines\ibis\io\jdbc.py in table_to_df(self, table_name, columns, limit) 59 if limit is None: 60 limit = LIMIT_TABLE ---> 61 return IbisDataFrame(db.table(table_name).limit(limit)) ~\Anaconda3\lib\site-packages\ibis\backends\mysql\client.py in table(self, name, database, schema) 202 return self.database(name=database).table(name=name, schema=schema) 203 else: --> 204 alch_table = self._get_sqla_table(name, schema=schema) 205 node = self.table_class(alch_table, self, self._schemas.get(name)) 206 return self.table_expr_class(node) ~\Anaconda3\lib\site-packages\ibis\backends\base_sqlalchemy\alchemy.py in _get_sqla_table(self, name, schema, autoload) 1298 1299 def _get_sqla_table(self, name, schema=None, autoload=True): -> 1300 return sa.Table(name, self.meta, schema=schema, autoload=autoload) 1301 1302 def _sqla_table_to_expr(self, table): <string> in __new__(cls, *args, **kw) ~\Anaconda3\lib\site-packages\sqlalchemy\util\deprecations.py in warned(fn, *args, **kwargs) 126 ) 127 --> 128 return fn(*args, **kwargs) 129 130 doc = fn.__doc__ is not None and fn.__doc__ or "" ~\Anaconda3\lib\site-packages\sqlalchemy\sql\schema.py in __new__(cls, *args, **kw) 494 except: 495 with util.safe_reraise(): --> 496 metadata._remove_table(name, schema) 497 498 @property ~\Anaconda3\lib\site-packages\sqlalchemy\util\langhelpers.py in __exit__(self, type_, value, traceback) 66 self._exc_info = None # remove potential circular references 67 if not self.warn_only: ---> 68 compat.reraise(exc_type, exc_value, exc_tb) 69 else: 70 if not compat.py3k and self._exc_info and self._exc_info[1]: ~\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause) 151 if value.__traceback__ is not tb: 152 raise value.with_traceback(tb) --> 153 raise value 154 155 def u(s): ~\Anaconda3\lib\site-packages\sqlalchemy\sql\schema.py in __new__(cls, *args, **kw) 489 metadata._add_table(name, schema, table) 490 try: --> 491 table._init(name, metadata, *args, **kw) 492 table.dispatch.after_parent_attach(table, metadata) 493 return table ~\Anaconda3\lib\site-packages\sqlalchemy\sql\schema.py in _init(self, name, metadata, *args, **kwargs) 583 include_columns, 584 _extend_on=_extend_on, --> 585 resolve_fks=resolve_fks, 586 ) 587 ~\Anaconda3\lib\site-packages\sqlalchemy\sql\schema.py in _autoload(self, metadata, autoload_with, include_columns, exclude_columns, resolve_fks, _extend_on) 624 exclude_columns, 625 resolve_fks, --> 626 _extend_on=_extend_on, 627 ) 628 ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in run_callable(self, callable_, *args, **kwargs) 2148 """ 2149 with self._contextual_connect() as conn: -> 2150 return conn.run_callable(callable_, *args, **kwargs) 2151 2152 def execute(self, statement, *multiparams, **params): ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in run_callable(self, callable_, *args, **kwargs) 1602 1603 """ -> 1604 return callable_(self, *args, **kwargs) 1605 1606 def _run_visitor(self, visitorcallable, element, **kwargs): ~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in reflecttable(self, connection, table, include_columns, exclude_columns, resolve_fks, **opts) 429 insp = reflection.Inspector.from_engine(connection) 430 return insp.reflecttable( --> 431 table, include_columns, exclude_columns, resolve_fks, **opts 432 ) 433 ~\Anaconda3\lib\site-packages\sqlalchemy\engine\reflection.py in reflecttable(self, table, include_columns, exclude_columns, resolve_fks, _extend_on) 638 639 for col_d in self.get_columns( --> 640 table_name, schema, **table.dialect_kwargs 641 ): 642 found_table = True ~\Anaconda3\lib\site-packages\sqlalchemy\engine\reflection.py in get_columns(self, table_name, schema, **kw) 371 372 col_defs = self.dialect.get_columns( --> 373 self.bind, table_name, schema, info_cache=self.info_cache, **kw 374 ) 375 for col_def in col_defs: ~\Anaconda3\lib\site-packages\impala\sqlalchemy.py in get_columns(self, connection, table_name, schema, **kwargs) 204 name = '%s.%s' % (schema, name) 205 query = 'SELECT * FROM %s LIMIT 0' % name --> 206 cursor = connection.execute(query) 207 schema = cursor.cursor.description 208 # We need to fetch the empty results otherwise these queries remain in ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in execute(self, object_, *multiparams, **params) 980 """ 981 if isinstance(object_, util.string_types[0]): --> 982 return self._execute_text(object_, multiparams, params) 983 try: 984 meth = object_._execute_on_connection ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_text(self, statement, multiparams, params) 1153 parameters, 1154 statement, -> 1155 parameters, 1156 ) 1157 if self._has_events or self.engine._has_events: ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1180 except BaseException as e: 1181 self._handle_dbapi_exception( -> 1182 e, util.text_type(statement), parameters, None, None 1183 ) 1184 ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context) 1464 util.raise_from_cause(newraise, exc_info) 1465 elif should_wrap: -> 1466 util.raise_from_cause(sqlalchemy_exception, exc_info) 1467 else: 1468 util.reraise(*exc_info) ~\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in raise_from_cause(exception, exc_info) 396 exc_type, exc_value, exc_tb = exc_info 397 cause = exc_value if exc_value is not exception else None --> 398 reraise(type(exception), exception, tb=exc_tb, cause=cause) 399 400 ~\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause) 150 value.__cause__ = cause 151 if value.__traceback__ is not tb: --> 152 raise value.with_traceback(tb) 153 raise value 154 ~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1177 conn = self._revalidate_connection() 1178 -> 1179 context = constructor(dialect, self, conn, *args) 1180 except BaseException as e: 1181 self._handle_dbapi_exception( ~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in _init_statement(cls, dialect, connection, dbapi_connection, statement, parameters) 907 self.statement = self.unicode_statement = statement 908 --> 909 self.cursor = self.create_cursor() 910 return self 911 ~\Anaconda3\lib\site-packages\impala\sqlalchemy.py in create_cursor(self) 135 self._is_server_side = False 136 cursor_configuration = self.execution_options.get('cursor_configuration', {}) --> 137 return self._dbapi_connection.cursor(configuration=cursor_configuration) 138 139 ~\Anaconda3\lib\site-packages\sqlalchemy\pool\base.py in cursor(self, *args, **kwargs) 963 964 """ --> 965 return self.connection.cursor(*args, **kwargs) 966 967 def __getattr__(self, key): ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in cursor(self, user, configuration, convert_types, dictify, fetch_error) 122 log.debug('.cursor(): getting new session_handle') 123 --> 124 session = self.service.open_session(user, configuration) 125 126 log.debug('HiveServer2Cursor(service=%s, session_handle=%s, ' ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in open_session(self, user, configuration) 1073 username=user, 1074 configuration=configuration) -> 1075 resp = self._rpc('OpenSession', req) 1076 return HS2Session(self, resp.sessionHandle, 1077 resp.configuration, ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in _rpc(self, func_name, request) 991 def _rpc(self, func_name, request): 992 self._log_request(func_name, request) --> 993 response = self._execute(func_name, request) 994 self._log_response(func_name, response) 995 err_if_rpc_not_ok(response) ~\Anaconda3\lib\site-packages\impala\hiveserver2.py in _execute(self, func_name, request) 1008 log.debug('Transport opened') 1009 func = getattr(self.client, func_name) -> 1010 return func(request) 1011 except socket.error: 1012 log.exception('Failed to open transport (tries_left=%s)', ~\Anaconda3\lib\site-packages\thriftpy2\thrift.py in _req(self, _api, *args, **kwargs) 217 # wait result only if non-oneway 218 if not getattr(result_cls, "oneway"): --> 219 return self._recv(_api) 220 221 def _send(self, _api, **kwargs): ~\Anaconda3\lib\site-packages\thriftpy2\thrift.py in _recv(self, _api) 229 230 def _recv(self, _api): --> 231 fname, mtype, rseqid = self._iprot.read_message_begin() 232 if mtype == TMessageType.EXCEPTION: 233 x = TApplicationException() ~\Anaconda3\lib\site-packages\thriftpy2\protocol\binary.py in read_message_begin(self) 371 def read_message_begin(self): 372 api, ttype, seqid = read_message_begin( --> 373 self.trans, strict=self.strict_read) 374 return api, ttype, seqid 375 ~\Anaconda3\lib\site-packages\thriftpy2\protocol\binary.py in read_message_begin(inbuf, strict) 177 if strict: 178 raise TProtocolException(type=TProtocolException.BAD_VERSION, --> 179 message='No protocol version header') 180 181 name = inbuf.read(sz).decode('utf-8') StatementError: (thriftpy2.protocol.exc.TProtocolException) No protocol version header [SQL: SELECT * FROM prueba LIMIT 0]
df.head()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) | |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) | |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) | |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) | |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) | |
5 | I182070936 | 03820 | Motor Vehicle Accident Response | M/V ACCIDENT INVOLVING PEDESTRIAN - INJURY | C11 | 398 | 2018-09-03 21:09:00 | 2018 | 9 | Monday | 21 | Part Three | TALBOT AVE | 42.29019621 | -71.07159012 | (42.29019621, -71.07159012) | |
6 | I182070933 | 00724 | Auto Theft | AUTO THEFT | B2 | 330 | 2018-09-03 21:25:00 | 2018 | 9 | Monday | 21 | Part One | NORMANDY ST | 42.30607218 | -71.08273260 | (42.30607218, -71.08273260) | |
7 | I182070932 | 03301 | Verbal Disputes | VERBAL DISPUTE | B2 | 584 | 2018-09-03 20:39:37 | 2018 | 9 | Monday | 20 | Part Three | LAWN ST | 42.32701648 | -71.10555088 | (42.32701648, -71.10555088) | |
8 | I182070931 | 00301 | Robbery | ROBBERY - STREET | C6 | 177 | 2018-09-03 20:48:00 | 2018 | 9 | Monday | 20 | Part One | MASSACHUSETTS AVE | 42.33152148 | -71.07085307 | (42.33152148, -71.07085307) | |
9 | I182070929 | 03301 | Verbal Disputes | VERBAL DISPUTE | C11 | 364 | 2018-09-03 20:38:00 | 2018 | 9 | Monday | 20 | Part Three | LESLIE ST | 42.29514664 | -71.05860832 | (42.29514664, -71.05860832) |
df1 = df.cols.lower("INCIDENT_NUMBER")
df.to_pandas()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) | |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) | |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) | |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) | |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) | |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) | |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) | |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) | |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
df.to_optimus_pandas().cols.lower("OFFENSE_DESCRIPTION")
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
00619
|
Larceny
|
larceny⋅all⋅others
|
D14
|
808
|
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
01402
|
Vandalism
|
vandalism
|
C11
|
347
|
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
03410
|
Towed
|
towed⋅motor⋅vehicle
|
D4
|
151
|
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
03114
|
Investigate⋅Property
|
investigate⋅property
|
D4
|
272
|
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
03114
|
Investigate⋅Property
|
investigate⋅property
|
B3
|
421
|
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
03820
|
Motor⋅Vehicle⋅Accident⋅Response
|
m/v⋅accident⋅involving⋅pedestrian⋅-⋅injury
|
C11
|
398
|
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
00724
|
Auto⋅Theft
|
auto⋅theft
|
B2
|
330
|
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
03301
|
Verbal⋅Disputes
|
verbal⋅dispute
|
B2
|
584
|
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
00301
|
Robbery
|
robbery⋅-⋅street
|
C6
|
177
|
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
<class 'optimus.engines.pandas.dataframe.PandasDataFrame'>
df.meta
{'profile': {'columns': {'INCIDENT_NUMBER': {'profiler_dtype': {'dtype': 'string'}, 'stats': {'match': 319073, 'missing': 0, 'mismatch': 0, 'profiler_dtype': {'dtype': 'string'}, 'frequency': [{'value': 'I162030584', 'count': 13}, {'value': 'I152080623', 'count': 11}, {'value': 'I172013170', 'count': 10}, {'value': 'I182065208', 'count': 10}, {'value': 'I172096394', 'count': 10}, {'value': 'I162071327', 'count': 9}, {'value': 'I162001871', 'count': 9}, {'value': 'I172056883', 'count': 9}, {'value': 'I172054429', 'count': 9}, {'value': 'I162098170', 'count': 9}, {'value': 'I172022524', 'count': 9}, {'value': 'I152076465', 'count': 8}, {'value': 'I172053616', 'count': 8}, {'value': 'I152105431', 'count': 8}, {'value': 'I162082917', 'count': 8}, {'value': 'I162064331', 'count': 8}, {'value': 'I162056703', 'count': 8}, {'value': 'I130041200-00', 'count': 8}, {'value': 'I162087224', 'count': 8}, {'value': 'I162022140', 'count': 8}, {'value': 'I172069723', 'count': 8}, {'value': 'I162090278', 'count': 8}, {'value': 'I162074826', 'count': 8}, {'value': 'I162078338', 'count': 8}, {'value': 'I152101399', 'count': 7}, {'value': 'I152095733', 'count': 7}, {'value': 'I162054378', 'count': 7}, {'value': 'I152067057', 'count': 7}, {'value': 'I152096998', 'count': 7}, {'value': 'I172018004', 'count': 7}, {'value': 'I162083089', 'count': 7}, {'value': 'I162045680', 'count': 7}, {'value': 'I152071480', 'count': 7}], 'count_uniques': 282517}, 'dtype': 'object'}}, 'name': None, 'file_name': None, 'summary': {'cols_count': 17, 'rows_count': 319073, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}, 'transformations': {'actions': []}}
df[0]
INCIDENT_NUMBER
1 (object)
not nullable
|
---|
I182016471
|
I182062395
|
I162052877
|
I162067216
|
I172079919
|
I162027167
|
I162039177
|
I172093225
|
I162030885
|
I162028326
|
<class 'optimus.engines.dask.dataframe.DaskDataFrame'>
import dask
from dask.dataframe import from_delayed
def oset(col_name, expr, where):
where = (where.data[where.cols.names(0)[0]]).to_delayed()
ddf = df.to_delayed()
def func(_df, _col_name, _mask):
_df.loc[_mask,[_col_name]]=expr(_df)
return _df
for _part,_mask in zip(ddf, where):
delayed_parts = dask.delayed(func)(_part, col_name, _mask)
return from_delayed(delayed_parts)
col_name="INCIDENT_NUMBER"
where= ((df[col_name]=="I182070945") | (df["DISTRICT"]=="I182070945"))
F = df.functions
def expr(series):
return F.lower(series["OFFENSE_CODE_GROUP"])
oset(col_name, expr, where).compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | larceny | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) | |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) | |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) | |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) | |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) | |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) | |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) | |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) | |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns