Pierre Gutierrez, Data Scientist at Dataiku (www.dataiku.com).
Medicare (real) data can be found here: https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Part-D-Prescriber.html
Complements for other years can be found on the ProPublica website: https://projects.propublica.org/data-store/
Payments can be found here: http://www.cms.gov/OpenPayments/Explore-the-Data/Dataset-Downloads.html
NPI exclusions can be found here: https://oig.hhs.gov/exclusions/exclusions_list.asp#instruct
FDA information comes from: http://www.fda.gov/Drugs/InformationOnDrugs/ucm079750.htm
We also provide the drug_paiments_companies file, which makes it possible to join the companies in the FDA data with those in the payments data. It was generated with fuzzy matching, so it is not exact and may contain errors.
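If you want to reproduce or refine that mapping, here is a minimal sketch of such name-based fuzzy matching using Python's standard difflib module (the company lists below are made-up placeholders, not the real file contents):
import difflib

# made-up example names standing in for the FDA and payments company lists
fda_companies = ['PFIZER INC', 'NOVARTIS PHARMACEUTICALS CORP', 'MERCK AND CO INC']
payment_companies = ['Pfizer, Inc.', 'Novartis Pharmaceuticals Corporation', 'Merck & Co., Inc.']

def normalize(name):
    # lowercase and drop punctuation so superficial differences don't block a match
    return ''.join(c for c in name.lower() if c.isalnum() or c == ' ').strip()

normalized = [normalize(n) for n in payment_companies]
for company in fda_companies:
    # keep the closest candidate above the similarity cutoff, if any
    matches = difflib.get_close_matches(normalize(company), normalized, n=1, cutoff=0.6)
    if matches:
        print company, '->', payment_companies[normalized.index(matches[0])]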
NB: ProPublica has a lot of interesting articles if needed, e.g. on fraud:
http://www.propublica.org/article/fraud-still-plagues-medicare-drug-program-watchdog-finds
http://www.propublica.org/article/fanny-pack-mixup-unravels-massive-medicare-fraud-scheme
http://www.propublica.org/article/how-fraud-flourishes-in-medicares-drug-plan
An interesting report on how to detect fraud: http://oig.hhs.gov/oei/reports/oei-02-09-00603.pdf
NB: if you don't have a nice SQL interface, use psycopg2!
con = psycopg2.connect(host="localhost", user=USER, password=PASSWORD, database=DATABASE)
cur = con.cursor()
cur.execute('CREATE TABLE ... AS SELECT ... ')
con.commit()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random as rd
import psycopg2
from pandas.io import sql
import scipy.sparse
# scikit learn
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
# define the connection
USER = '*****'
PASSWORD = '*****'
DATABASE = '*****'
con = psycopg2.connect(host="localhost", user=USER, password=PASSWORD, database=DATABASE)
q = "SELECT * FROM complete;"
df = sql.read_frame(q, con)  # with recent pandas, use pd.read_sql instead
df.columns
categorical_features = [u'speciality']
# base features
numerical_features = ['nb_diff_prescriptions', 'sum_prescr', 'sum_days', 'sum_cost',
                      'var_prescr', 'var_days', 'var_cost',
                      'max_prescr', 'max_days', 'max_cost']
# outlier features
numerical_features = numerical_features + ['nb_perc_presc', 'nb_perc_days', 'nb_perc_cost', 'perc_unicost']
# graph features
numerical_features = numerical_features + ['score', 'weighted_score']
# payment features
numerical_features = numerical_features + ['payments_2013']
target = ['is_fraud']
allvars = categorical_features + numerical_features + target
y = df["is_fraud"].values
X = df[allvars].drop('is_fraud',axis=1)
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(X, y, test_size=0.2, random_state=0)
print X_train.shape, X_valid.shape
# and this is a 5-fold cross validation example
#kf = cross_validation.KFold(X.shape[0], n_folds=5, random_state=0)
#for train_index, valid_index in kf:
# print("TRAIN:", len(train_index), "TEST:", len(valid_index))
# X_train, X_valid = X.values[train_index], X.values[valid_index]
# y_train, y_valid = y[train_index], y[valid_index]
# filling missing values
X_train[numerical_features] = X_train[numerical_features].fillna(0)  # would the mean be a better imputation?
X_valid[numerical_features] = X_valid[numerical_features].fillna(0)  # would the mean be a better imputation?
X_train[categorical_features] = X_train[categorical_features].fillna('unknown')
X_valid[categorical_features] = X_valid[categorical_features].fillna('unknown')
# rescaling
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features].values)
X_valid[numerical_features] = scaler.transform(X_valid[numerical_features].values)
# dummify categorical features: each row becomes a dict {column_index: category_value},
# and DictVectorizer one-hot encodes the string values
vectorizer = DictVectorizer()
tmp = [dict(enumerate(sample)) for sample in X_train[categorical_features].values]
tmp2 = [dict(enumerate(sample)) for sample in X_valid[categorical_features].values]
X_train = scipy.sparse.hstack((X_train[numerical_features].values, vectorizer.fit_transform(tmp)))
X_valid = scipy.sparse.hstack((X_valid[numerical_features].values, vectorizer.transform(tmp2)))
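To see what the dict(enumerate(...)) trick produces, here is a tiny self-contained illustration (with made-up speciality values); DictVectorizer turns each distinct string value into its own 0/1 column:
from sklearn.feature_extraction import DictVectorizer
demo = DictVectorizer()
# two rows, one categorical column (index 0)
print demo.fit_transform([{0: 'cardiology'}, {0: 'oncology'}]).toarray()
# [[ 1.  0.]
#  [ 0.  1.]]
print demo.get_feature_names()
# ['0=cardiology', '0=oncology']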
# declare classifiers
clf1 = LogisticRegression()  # pimp me
clf2 = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='auto')  # pimp me ('auto' is spelled 'balanced' in recent scikit-learn)
# train model 1
clf1.fit(X_train, y_train)
# train model 2 (densify the sparse matrix, as the random forest expects an array here)
clf2.fit(X_train.toarray(), y_train)
# evaluate and plot roc curve 1
probas = clf1.predict_proba(X_valid)
fpr, tpr, thresholds = roc_curve(y_valid, probas[:, 1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
# evaluate and plot roc curve 2
probas = clf2.predict_proba(X_valid.toarray())
fpr, tpr, thresholds = roc_curve(y_valid, probas[:, 1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
# That's all! It's now up to you to create your own features and to tune your algorithms!
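For the tuning part, here is a minimal sketch of a grid search over the random forest hyper-parameters. It uses the sklearn.grid_search module matching the scikit-learn version used above (recent versions moved GridSearchCV to sklearn.model_selection), and the grid values are illustrative guesses, not recommended settings:
from sklearn.grid_search import GridSearchCV

# illustrative grid values, not recommended settings
param_grid = {'n_estimators': [100, 300],
              'max_depth': [5, 10, 20]}
grid = GridSearchCV(RandomForestClassifier(class_weight='auto'),
                    param_grid, scoring='roc_auc', cv=5)
grid.fit(X_train.toarray(), y_train)
print grid.best_params_, grid.best_score_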