Show code
In [1]:
# Set up inline plotting; this magic also populates the interactive namespace
# with names from numpy and matplotlib.
# NOTE(review): the implicit names make it hard to tell where a symbol comes
# from. Consider `%matplotlib inline` plus explicit `import numpy as np` /
# `import matplotlib.pyplot as plt` once it is confirmed that no later cell
# relies on the bare pylab names.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
In [3]:
# Load the DSS dataset named "orders" into a Pandas dataframe.
# `dataset_orders` is the dataset handle; `df` holds its contents and is
# reused by the following cells.
dataset_orders = dataiku.Dataset("orders")
df = dataset_orders.get_dataframe()
# Preview the first rows as the cell's output.
df.head()
Out[3]:
order_date pages_visited order_id customer_id tshirt_category tshirt_price tshirt_quantity
0 2016/09/04 9 HTS-038040-0002 038040 White T-Shirt M 20.0 1
1 2014/11/14 11 HTS-801797-0001 801797 White T-Shirt M 20.0 1
2 2017/02/26 10 HTS-vft1eu-0003 vft1eu White T-Shirt F 18.0 3
3 2013/12/01 10 HTS-914324-0001 914324 Wh Tshirt F 18.0 1
4 2015/10/22 12 HTS-88ua9r-0001 88ua9r White T-Shirt M 20.0 1
In [4]:
# Per-column audit of the orders dataframe: the displayed table lists each
# variable with its data type, cardinality, missing-value count and a couple
# of sample values.
pdu.audit(df)
Out[4]:
_a_variable _b_data_type _c_cardinality _d_missings _e_sample_values
0 order_date object 1431 0 [2016/09/04, 2014/11/14]
1 pages_visited int64 26 0 [9, 11]
2 order_id object 26511 0 [HTS-038040-0002, HTS-801797-0001]
3 customer_id object 11651 0 [038040, 801797]
4 tshirt_category object 10 0 [White T-Shirt M, White T-Shirt F]
5 tshirt_price float64 6 0 [20.0, 18.0]
6 tshirt_quantity int64 28 0 [1, 3]