The goal of this notebook is to implement the decision tree algorithm. Trees can be used both for prediction and for classification. As Miguel explained, there are different measures for selecting the splitting variable of a tree.
To start, we will work with categorical data:
$H(T) = \sum_{i=1}^{C} -p_i \log_2 p_i$
$H(T,X) = \sum_{c \in X} p(c)\,H(c)$
$Gain(T,X) = H(T) - H(T,X)$
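As a quick sanity check of these formulas, here is a minimal standalone sketch (independent of the Tree class implemented below) that computes the entropy and the gain of a toy split:

import numpy as np

def entropy(y):
    """H(T): entropy of the class distribution in y."""
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

def gain(x, y):
    """Gain(T, X): entropy reduction from splitting y on the values of x."""
    values, counts = np.unique(x, return_counts=True)
    h_cond = sum(c / len(x) * entropy(y[x == v])
                 for v, c in zip(values, counts))
    return entropy(y) - h_cond

y = np.array(["No", "No", "Yes", "Yes"])
x = np.array(["a", "a", "b", "b"])
print(entropy(y))  # 1.0 bit: two equally likely classes
print(gain(x, y))  # 1.0: x separates the classes perfectly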
Some reference links:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib
from sklearn import datasets
from networkx.drawing.nx_agraph import graphviz_layout
%matplotlib inline
iris = datasets.load_iris()
df_x = pd.DataFrame(iris['data'], columns=iris.feature_names )
df_x.head(5)
df_trans = df_x.apply(lambda x: pd.qcut(x, 6, labels=False), axis=0)  # discretize each feature into 6 quantile bins
y = iris['target']
df_trans.head(5)
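Since the tree below expects categorical inputs, each continuous iris feature was binned into 6 quantile groups; with labels=False, pd.qcut returns the integer bin code of each row. A quick check of the encoding (a small sketch):

print(df_trans.nunique())                    # typically 6 distinct codes per column
print(sorted(df_trans.iloc[:, 0].unique()))  # codes 0..5 for the first feature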
class Tree():
    """ID3-style decision tree for categorical features, stored as a networkx DiGraph."""

    def __init__(self, stop=4, verbose=False, col_names=None):
        self.G = nx.DiGraph()
        self.verbose = verbose
        self.col_names = col_names
        self.stop = stop  # maximum-depth parameter (not used in this version)

    def train(self, X, y):
        def create_mask(r_ix_aux, r_ix):
            # Re-expand a mask computed on a subset (r_ix) back onto the
            # full-length mask r_ix_aux over the original rows.
            t = 0
            for ix, ix_val in enumerate(r_ix_aux):
                if ix_val:
                    r_ix_aux[ix] = r_ix[t]
                    t += 1
                else:
                    r_ix_aux[ix] = False
            return r_ix_aux

        X_aux = X.copy()
        y_aux = y.copy()
        w_queue = []  # BFS queue of branches still to expand
        c_ix, h_val, g_val = self.calculate_gain(X, y)
        node_prev = self.get_name(c_ix)
        self.G.add_node(node_prev)
        uniques, counts = np.unique(X_aux[:, c_ix], return_counts=True)
        r_ix = [True] * X.shape[0]
        w_queue.extend([(c_ix, r_ix.copy(), ele, node_prev) for ele in uniques])
        while len(w_queue) > 0:
            c_ix, r_ix, ele, node_prev = w_queue.pop(0)
            # Keep only the rows of this branch whose value equals ele.
            r_ix_aux = r_ix
            r_ix = np.array(np.equal(X[r_ix, c_ix], [ele]))
            r_ix = create_mask(r_ix_aux, r_ix)
            c_ix, h_val, g_val = self.calculate_gain(X_aux[r_ix, :], y_aux[r_ix])
            if self.verbose:
                print('control', c_ix, g_val, sum(g_val), len(w_queue))
            if sum(h_val) > 0 and sum(g_val) > 0:
                # Impure node with positive gain: split again on the best column.
                node_cur = self.get_name(c_ix)
                self.G.add_node(node_cur)
                self.G.add_edge(node_prev,
                                node_cur,
                                attr_dict={"name": ele, "col": c_ix})
                uniques, counts = np.unique(X_aux[r_ix, c_ix], return_counts=True)
                w_queue.extend([(c_ix, r_ix.copy(), ele, node_cur) for ele in uniques])
            if sum(h_val) == 0.0:
                # Pure node: attach a leaf labelled with the (single) class.
                uniques, counts = np.unique(y_aux[r_ix],
                                            return_counts=True)
                for unique, count in zip(uniques, counts):
                    final_node = node_prev + "-" + str(unique)
                    self.G.add_node(final_node)
                    self.G.add_edge(node_prev,
                                    final_node,
                                    attr_dict={"name": ele, "col": count})

    def calculate_gain(self, X, y):
        """Return the index of the best column, plus the per-column
        conditional entropies and information gains."""
        ini_h = self.entropy(y)
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        n = X.shape[1]
        t_gain = []
        t_entropy = []
        for ix in range(n):
            h_Xy = self.entropy_2v(X[:, ix], y)
            t_gain.append(ini_h - h_Xy)
            t_entropy.append(h_Xy)
        ix_sel = np.argmax(t_gain)
        return ix_sel, t_entropy, t_gain

    def entropy(self, y):
        """H(T) = sum_i -p_i log2 p_i over the class frequencies of y."""
        n = np.array(y).shape[0]
        unique, counts = np.unique(y, return_counts=True)
        f_plog = lambda x: -x * math.log(x, 2)
        h = sum([f_plog(c * 1.0 / n) for c in counts])
        if self.verbose:
            msj = "Num {0} Counts {1} Entropy {2:.02f}"
            print(msj.format(unique, counts, h))
        return h

    def entropy_2v(self, x, y):
        """Conditional entropy H(T, X) = sum_c p(c) H(T | X = c)."""
        n = np.array(x).shape[0]
        uniques, counts = np.unique(x, return_counts=True)
        h_st = 0
        for unique, count in zip(uniques, counts):
            prob_c = count * 1.0 / n
            ix = np.array(np.equal(x, [unique]))
            h_st += prob_c * self.entropy(y[ix])
            if self.verbose:
                msj = "Prob {0:.02f} Entropy {1:.02f}"
                print(msj.format(prob_c, self.entropy(y[ix])))
        return h_st

    def predict(self, X):
        # Not implemented in this notebook; see the traversal sketch below.
        raise NotImplementedError

    def get_name(self, ix):
        return self.col_names[ix] if self.col_names is not None else str(ix)
m_tree = Tree(verbose=True, col_names=df_trans.columns.values)
m_tree.train(df_trans.values, y)
data = [["D1","Sunny","Hot","High","Weak","No"],
["D2","Sunny","Hot","High","Strong","No"],
["D3","Overcast","Hot","High","Weak","Yes"],
["D4","Rain","Mild","High","Weak","Yes"],
["D5","Rain","Cool","Normal","Weak","Yes"],
["D6","Rain","Cool","Normal","Strong","No"],
["D7","Overcast","Cool","Normal","Strong","Yes"],
["D8","Sunny","Mild","High","Weak","No"],
["D9","Sunny","Cool","Normal","Weak","Yes"],
["D10","Rain","Mild","Normal","Weak","Yes"],
["D11","Sunny","Mild","Normal","Strong","Yes"],
["D12","Overcast","Mild","High","Strong","Yes"],
["D13","Overcast","Hot","Normal","Weak","Yes"],
["D14","Rain","Mild","High","Strong","No"]]
columns=["Day","Outlook","Temperature","Humidity","Wind","Play Golf"]
df_golf = pd.DataFrame(data,columns=columns )
m_tree = Tree(verbose=True,col_names=columns[1:5] )
m_tree.train(df_golf.iloc[:,1:5].values,df_golf.iloc[:,-1].values)
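As a hand check of the root entropy: the golf data has 9 Yes and 5 No labels, so H(T) should come out to about 0.94 bits.

h_root = m_tree.entropy(df_golf["Play Golf"].values)
print(round(h_root, 3))  # -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940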
#m_tree.entropy_2v(df_golf.iloc[:,4],df_golf.iloc[:,-1].values)
edges_n = {(u,v):m_tree.G.get_edge_data(u,v)['attr_dict']['name']
for u,v in m_tree.G.edges()}
edges_n
plt.title('Tree Model')
pos = graphviz_layout(m_tree.G, prog='dot')
edges_n = {(u,v):m_tree.G.get_edge_data(u,v)['attr_dict']['name']
for u,v in m_tree.G.edges()}
nx.draw_networkx_edge_labels(m_tree.G,pos,edge_labels=edges_n)
nx.draw(m_tree.G, pos, with_labels=True, arrows=True)
plt.show()
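Since predict is left as a stub in the class above, the sketch below shows one way to classify a sample by walking the learned graph. predict_one is a hypothetical helper, not part of the original class; it assumes networkx 2.x and that each attribute appears at most once as a node name (true for this golf tree, since networkx merges nodes with equal names) and that leaf names end in "-<class>".

def predict_one(tree, x, col_names):
    """Walk tree.G from the root, following the edge whose value matches x."""
    names = list(col_names)
    node = next(n for n, d in tree.G.in_degree() if d == 0)  # the root node
    while tree.G.out_degree(node) > 0:
        col = names.index(node)                # column this node splits on
        nxt = None
        for _, child, data in tree.G.out_edges(node, data=True):
            if data['attr_dict']['name'] == x[col]:
                nxt = child
                break
        if nxt is None:                        # value never seen in training
            return None
        node = nxt
    return node.rsplit('-', 1)[-1]             # leaf name encodes the class

# e.g. D1 = (Sunny, Hot, High, Weak) -> expected 'No'
print(predict_one(m_tree, df_golf.iloc[0, 1:5].values, columns[1:5]))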
import graphviz
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=0,criterion='entropy')
clf.fit(df_trans, y)
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
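cross_val_score was imported above but never used; as a quick comparison with the hand-rolled tree, we can cross-validate the sklearn classifier on the same discretized features (a minimal usage sketch):

scores = cross_val_score(clf, df_trans, y, cv=5)
print(scores.mean())  # mean accuracy over 5 folds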