In this project we aim to build a primary tumor classifier, applying what we have learned in Artificial Intelligence and Learning, a course taught in the Biomedical Engineering degree at Rey Juan Carlos University.
Below, we analyze the dataset we will be working with.
The dataset we have chosen is a well-known one about primary tumors. It is mainly composed of multivariate categorical features, so this is a classification problem.
We have a total of 339 patients, whereas the number of features is 18. We will have to deal with a total of 225 missing values.
This dataset was created on November 1st, 1988 by the Institute of Oncology of the University Medical Center in Ljubljana, Slovenia.
The features we are going to work with are:
-Tumor's class
-Patient's age
-Patient's gender
-Histological type of the tumor
-Degree of differentiation (similarity with respect to the neighbouring cells)
-Starting place (bone, bone marrow, lung, pleura, peritoneum, liver, brain, skin, neck, supraclavicular region, axillary region, mediastinum or abdominal region).
A primary tumor is one that grows at the anatomical site where the disease started, forming a cancerous mass without metastasis. This is the main difference from secondary tumors, which spread throughout the rest of the body.
The distribution over our 22 classes is not uniform; we have an imbalanced dataset.
We now read the dataset to get a first look at it.
import pandas as pd
import statistics as stats
import sklearn.model_selection as skm
columnas = ['class', 'age', 'sex', 'histologic-type', 'degree-of-diffe', 'bone', 'bone-marrow', 'lung', 'pleura', 'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular', 'axillar', 'mediastinum', 'abdominal']
file = pd.read_csv('primary-tumor.data', sep=',', names = columnas)
#We assign descriptive column names, since the original file encodes the features as numbers;
#the idea is to improve the interpretability of the data.
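As a quick sanity check (a sketch we add here; it is not part of the original notebook), the raw class frequencies can be listed to confirm the imbalance mentioned above:
#Sketch: number of samples per tumor class in the raw file.
print(file['class'].value_counts().sort_index())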
Below, we preprocess the data so that we work with the most informative features and obtain the best possible classification results.
import sklearn as sk
import matplotlib.pyplot as plt
import statistics as stat
import numpy as np
import seaborn as sns
import scipy as sc
import random
!pip install miceforest
import miceforest as mf
Our first approach was to impute the most plausible value for each cancer [1],[2],[3],[4],[5]. Afterwards, we investigated several techniques for removing NaN values [6],[7].
We finally decided to use the MICE (Multiple Imputation by Chained Equations) algorithm to deal with the missing values.
For features with a single NaN value, we assume it is Missing Completely at Random (MCAR).
Otherwise, since rather important features (according to the literature) show a relatively high percentage of missing values, we choose to impute them instead of deleting them.
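As a numeric check of this choice (a sketch we add; it is not part of the original pipeline), the share of '?' placeholders per feature can be inspected before any imputation:
#Sketch: percentage of missing ('?') entries per feature, to see which features
#carry enough missing data to justify imputation rather than deletion.
missing_pct = (file == '?').mean() * 100
print(missing_pct.sort_values(ascending=False))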
#We replace the '?' values with NaN.
file = file.replace('?', np.nan)
#First, we convert all the features to the categorical dtype [8].
for col in columnas:
    file[col] = file[col].astype('category', copy=False)
#We create the MICE kernel and run 3 iterations of chained equations.
kernel = mf.ImputationKernel(
    file,
    save_all_iterations=True,
    random_state=1
)
kernel.mice(3)
completed_dataset = kernel.complete_data(dataset=0, inplace=False)  #our new, imputed dataset
print(completed_dataset.isnull().sum(0))
#kernel.plot_imputed_distributions(wspace=0.35, hspace=0.4)
completed_dataset.head(20)
class              0
age                0
sex                0
histologic-type    0
degree-of-diffe    0
bone               0
bone-marrow        0
lung               0
pleura             0
peritoneum         0
liver              0
brain              0
skin               0
neck               0
supraclavicular    0
axillar            0
mediastinum        0
abdominal          0
dtype: int64
class | age | sex | histologic-type | degree-of-diffe | bone | bone-marrow | lung | pleura | peritoneum | liver | brain | skin | neck | supraclavicular | axillar | mediastinum | abdominal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 2 | 3 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
1 | 1 | 1 | 1 | 2 | 3 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 | 2 | 1 | 2 |
2 | 1 | 1 | 2 | 2 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 |
3 | 1 | 1 | 2 | 2 | 3 | 1 | 2 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 |
4 | 1 | 1 | 2 | 2 | 3 | 1 | 2 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 |
5 | 1 | 1 | 2 | 2 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 1 | 1 | 1 | 2 |
6 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | 2 | 2 |
7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
8 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 |
9 | 1 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | 2 | 2 |
10 | 1 | 2 | 1 | 1 | 3 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 1 | 2 |
11 | 1 | 2 | 1 | 1 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 1 | 2 | 2 |
12 | 1 | 2 | 1 | 1 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
13 | 1 | 2 | 1 | 1 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
14 | 1 | 2 | 1 | 1 | 3 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 2 |
15 | 1 | 2 | 1 | 1 | 3 | 2 | 2 | 2 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
16 | 1 | 2 | 1 | 1 | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 1 | 2 |
17 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 2 | 1 | 2 |
18 | 1 | 2 | 1 | 2 | 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 |
19 | 1 | 2 | 1 | 2 | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 1 | 2 |
Data transformation.
classs = completed_dataset["class"]
inputs = completed_dataset.drop(columns='class')
columns = ['age', 'sex', 'histologic-type', 'degree-of-diffe', 'bone', 'bone-marrow', 'lung', 'pleura', 'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular', 'axillar', 'mediastinum', 'abdominal']
for col in columns:
    inputs[col] = inputs[col].astype('int64', copy=False)
#We shift the category codes from {1, 2, 3} to {0, 1, 2}.
inputs = inputs.replace(to_replace = 1, value = 0, regex = True)
inputs = inputs.replace(to_replace = 2, value = 1, regex = True)
inputs = inputs.replace(to_replace = 3, value = 2, regex = True)
completed_dataset = inputs
completed_dataset["class"] = classs
inputs.head(20)
age | sex | histologic-type | degree-of-diffe | bone | bone-marrow | lung | pleura | peritoneum | liver | brain | skin | neck | supraclavicular | axillar | mediastinum | abdominal | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 2 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1 | 0 | 0 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
2 | 0 | 1 | 1 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 |
3 | 0 | 1 | 1 | 2 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 |
4 | 0 | 1 | 1 | 2 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 |
5 | 0 | 1 | 1 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 |
6 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
7 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
8 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 |
9 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
10 | 1 | 0 | 0 | 2 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
11 | 1 | 0 | 0 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
12 | 1 | 0 | 0 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
13 | 1 | 0 | 0 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
14 | 1 | 0 | 0 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 |
15 | 1 | 0 | 0 | 2 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
16 | 1 | 0 | 0 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
17 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
18 | 1 | 0 | 1 | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 |
19 | 1 | 0 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
completed_dataset.hist(figsize=(12, 12), bins=3, xlabelsize=8, ylabelsize=8)
[Output: 5 x 4 grid of histograms, one per feature]
As we can see, the features are imbalanced: within each feature, some categories have many more samples than the rest. We will see the same when we study the classes. For now we accept the cost of this imbalance on precision and on the r2_score; nevertheless, in future versions of this project we aim to balance this dataset properly.
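As one possible direction for that future balancing step (a sketch only; neither option is used in the results below, and imbalanced-learn is an extra dependency not installed here):
#Sketch: two common options for handling class imbalance (not applied in this project).
#1) Cost-sensitive learning via class weights in scikit-learn estimators.
from sklearn.linear_model import LogisticRegression
weighted_clf = LogisticRegression(class_weight='balanced', max_iter=1000)
#2) Random oversampling of minority classes with the imbalanced-learn package:
#from imblearn.over_sampling import RandomOverSampler
#X_res, y_res = RandomOverSampler(random_state=1).fit_resample(X, y)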
completed_dataset["class"].hist(figsize=(3, 6), bins=13, xlabelsize=8, ylabelsize=8)
[Output: histogram of the class variable]
import scipy.stats as stats
#We compute Cramér's V, a measure of association for categorical features derived from the chi-squared statistic.
X2 = stats.chi2_contingency(inputs, correction=False)[0]
n = np.sum(inputs)
minDim = min(inputs.shape)-1
V = np.sqrt((X2/n) / minDim)
print(V)
#The idea is to complement the Kendall correlation analysis below with these Cramér's V values.
age                0.615711
sex                0.951833
histologic-type    0.737287
degree-of-diffe    0.626927
bone               0.809029
bone-marrow        0.694990
lung               0.779373
pleura             0.779373
peritoneum         0.810686
liver              0.834994
brain              0.710123
skin               0.710123
neck               0.737287
supraclavicular    0.759495
axillar            0.723913
mediastinum        0.805747
abdominal          0.846103
dtype: float64
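As a complementary check (a sketch we add; the cell above applies the chi-squared statistic to the whole feature table at once), Cramér's V can also be computed for a single pair of features from a proper contingency table:
#Sketch: Cramér's V for one pair of categorical features, built with pd.crosstab.
def cramers_v(x, y):
    table = pd.crosstab(x, y)
    chi2 = stats.chi2_contingency(table, correction=False)[0]
    n = table.to_numpy().sum()
    min_dim = min(table.shape) - 1
    return np.sqrt((chi2 / n) / min_dim)
print(cramers_v(inputs['sex'], inputs['histologic-type']))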
plt.figure(figsize=(14,12))
sns.heatmap(inputs.corr(method = 'kendall'),linewidths=.1,cmap="YlGnBu", annot=True)
plt.yticks(rotation=0);
#As the heatmap below shows, the features are not strongly correlated. Kendall's tau is rank-based,
#so it makes no distributional assumptions and complements the Cramér's V values computed above.
To conclude this part of the analysis, we see that several classes contain very few samples. To address this, we delete these under-represented classes and group together the ones that are most similar (according to the state of the art on primary tumors).
We delete class number 9 and merge classes 18, 19 and 20 into "uterus cancer" [11].
We delete vaginal cancer and testicular cancer, and merge colon cancer and rectal cancer into "colorectal cancer" [12]. We merge bladder cancer and prostate cancer into "bladder or prostate cancer" [13]; finally, we delete duodenal cancer and salivary gland cancer, as their prevalence is lower.
Our main goal is to prevent logistic regression from always choosing the most represented class (one of its best-known biases) and to delete those cancers that would never be predicted because of their low prior probability.
Although our dataset will still be imbalanced, it is now less imbalanced than before.
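The cells below perform this regrouping with row indices taken from the original run. An equivalent, index-free sketch (an addition of ours, assuming the class labels are the integer codes of the raw file) would map labels directly:
#Sketch: express the merges below as a label map instead of per-row assignments
#(19 and 20 are folded into 18, 8 into 7, and 15 into 17).
merge_map = {19: 18, 20: 18, 8: 7, 15: 17}
relabelled = completed_dataset['class'].astype('int64').replace(merge_map)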
#We inspect and regroup the under-represented classes.
completed_dataset[completed_dataset["class"] == 9]
completed_dataset[completed_dataset['class'] == 20]
#Merge class 20 into class 18.
completed_dataset.at[313, 'class'] = 18
completed_dataset.at[312, 'class'] = 18
completed_dataset[completed_dataset['class'] == 19]
#Merge class 19 into class 18.
listt = [306, 307, 308, 309, 310, 311]
for i in range(len(listt)):
    completed_dataset.at[listt[i], 'class'] = 18
completed_dataset[completed_dataset['class'] == 21]
completed_dataset.drop(314, axis=0, inplace=True)
completed_dataset[completed_dataset['class'] == 16]
completed_dataset.drop(266, axis=0, inplace=True)
completed_dataset[completed_dataset['class'] == 8]
#Merge class 8 into class 7.
listt = [181, 182, 183, 184, 185, 186]
for i in range(len(listt)):
    completed_dataset.at[listt[i], 'class'] = 7
completed_dataset[completed_dataset['class'] == 15]
#Merge class 15 into class 17.
listt = [264, 265]
for i in range(len(listt)):
    completed_dataset.at[listt[i], 'class'] = 17
completed_dataset.drop(166, axis=0, inplace=True)
completed_dataset.drop(187, axis=0, inplace=True)
completed_dataset.drop(188, axis=0, inplace=True)
output = completed_dataset['class']
inputs = completed_dataset.drop(columns='class')
#Drop the now-unused categories (remove_categories no longer supports inplace, so we reassign).
output = output.cat.remove_categories(removals=[6, 10, 20, 19, 21, 16, 8, 15])
To demonstrate how we would handle continuous features, and given that our entire dataset is categorical, we add a synthetic random variable named proba-of-rec that uses numerical values to represent the "probability of recovery from cancer". Since it is a synthetic variable, we do not recommend drawing clinical conclusions from the results that involve it.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
input_lr = inputs
features = ['age', 'sex', 'histologic-type', 'degree-of-diffe', 'bone', 'bone-marrow', 'lung', 'pleura', 'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular', 'axillar', 'mediastinum', 'abdominal', 'proba-of-rec']
input_lr['proba-of-rec'] = np.random.randint(0, 100, size=len(inputs["age"]))
#Dividing our dataset in train and test.
X_train_lr, X_test_lr, Y_train_lr, Y_test_lr = train_test_split(input_lr, output, test_size = 0.3, random_state = 42)
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train_lr, Y_train_lr)
train = []
validation = []
#We store the train/validation indices of each stratified fold.
for train_index, validation_index in skf.split(X_train_lr, Y_train_lr):
    train_groups = []
    validation_groups = []
    train_groups.append(train_index)
    validation_groups.append(validation_index)
    train.append(train_groups)
    validation.append(validation_groups)
#We assume our data are i.i.d. (independent and identically distributed); this is not a time series.
#This is a simplifying assumption and a possible limitation of our study. [14]
regressor = LinearRegression()
scores_ecm = cross_val_score(regressor, X_train_lr, Y_train_lr, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(regressor, X_train_lr, Y_train_lr, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_ecm))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_ecm)))
print(np.mean(np.absolute(scores_r2)))
regressor = regressor.fit(X_train_lr,Y_train_lr)
y_predt = regressor.predict(X_test_lr)
mse=mean_squared_error(Y_test_lr, y_predt)
print(mse)
r_squared=r2_score(Y_test_lr, y_predt)
print(r_squared)
#Our poor r_squared results are most likely due to the synthetic variable and to using linear
#regression on this kind of data; decision trees or random forests, which are more robust
#for this type of problem, might have been a better choice.
MSE Scores: [-36.65218678 -33.57854138 -31.56804281 -34.27544203 -38.09796994]
R2 Scores : [0.22349445 0.29448526 0.33388921 0.22345648 0.20651104]
34.8344365862262
0.2563672870231148
31.79196502864336
0.41860161119881145
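As a quick check of the intuition in the comment above (a sketch we add; it is not part of the original pipeline), a random forest regressor can be cross-validated on the same stratified folds:
#Sketch: cross-validate a random forest on the same folds, as a rough comparison
#with the linear model above.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf_r2 = cross_val_score(rf, X_train_lr, Y_train_lr, cv=skf, scoring='r2', n_jobs=-1)
print('Random forest R2 scores: {}'.format(rf_r2))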
In this cell we try to determine which feature, used together with the synthetic variable, best captures the relation with the probability of recovering from cancer.
Feature | MSE | R2 |
---|---|---|
Age | 58 | -0.003 |
Sex | 50.52 | 0.125 |
Histologic-type | 55 | 0.48 |
Degree-of-diffe | 56.56 | 0.0213 |
Bone | 57.47 | 0.005675 |
Bone-marrow | 58.13 | -0.00577 |
Lung | 57.61 | 0.003 |
Pleura | 57.85 | -0.0008 |
Peritoneum | 55.59 | 0.038 |
Liver | 58.97 | -0.02 |
Brain | 54.395 | 0.05895 |
Skin | 58.13 | -0.006 |
Neck | 58.13 | -0.005 |
Supraclavicular | 56.63 | 0.02 |
Axillar | 50.91 | 0.119 |
Mediastinum | 55.9 | 0.03 |
Abdominal | 57.083 | 0.012 |
As can be seen, the predictive performance of the models that include the synthetic variable is worse, given its randomness compared with the fully categorical features.
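For reproducibility, the per-feature table above could plausibly be generated with a loop like the following (a sketch we add; the exact figures depend on the split and on the random synthetic variable):
#Sketch: fit one regression per feature, pairing it with the synthetic variable,
#mirroring the sex + proba-of-rec cell below.
for feat in features[:-1]:  #all features except 'proba-of-rec' itself
    pair_train = X_train_lr[['proba-of-rec', feat]]
    pair_test = X_test_lr[['proba-of-rec', feat]]
    uni = LinearRegression().fit(pair_train, Y_train_lr)
    pred = uni.predict(pair_test)
    print(feat, mean_squared_error(Y_test_lr, pred), r2_score(Y_test_lr, pred))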
listt = [X_train_lr['proba-of-rec'], X_train_lr['sex']]
test = pd.concat(listt, axis=1)
listt1 = [X_test_lr['proba-of-rec'], X_test_lr['sex']]
test2 = pd.concat(listt1, axis = 1)
regressor = regressor.fit(test,Y_train_lr)
y_pred = regressor.predict(test2)
# Measuring mean squared error
mse=mean_squared_error(Y_test_lr, y_pred)
print(mse)
# Measuring R^2
r_squared=r2_score(Y_test_lr, y_pred)
print(r_squared)
47.838217259770865 0.12515434598353792
sns.regplot(x = y_predt, y = Y_test_lr)
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()
#As we can see, the underlying pattern looks more like a sigmoid than a straight line.
As can be seen, given the sample distribution, the decision boundary is modelled much more effectively by models such as logistic regression.
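For reference, logistic regression passes the linear combination of the features through the sigmoid function, sigma(z) = 1 / (1 + e^(-z)) with z = b0 + b1*x1 + ... + bp*xp, which matches the S-shaped pattern observed in the plot above.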
Below, we add other regression variants based on Ridge, Lasso and Elastic Net to see whether they improve the model.
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
regressor = Lasso()
scores_ecm = cross_val_score(regressor, X_train_lr, Y_train_lr, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(regressor, X_train_lr, Y_train_lr, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_ecm))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_ecm)))
print(np.mean(np.absolute(scores_r2)))
regressor = regressor.fit(X_train_lr, Y_train_lr)
y_predt = regressor.predict(X_test_lr)
# Measuring mean squared error on the Lasso predictions
mse = mean_squared_error(Y_test_lr, y_predt)
print(mse)
# Measuring R^2
r_squared = r2_score(Y_test_lr, y_predt)
print(r_squared)
MSE Scores: [-45.11285105 -45.90994605 -44.49348754 -41.52011915 -45.52234323]
R2 Scores : [0.04424859 0.03539158 0.06115205 0.05932126 0.05187922]
44.5117494048351
0.05039854182792134
47.838217259770865
0.12515434598353792
ridge = Ridge()
scores_ecm = cross_val_score(ridge, X_train_lr, Y_train_lr, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(ridge, X_train_lr, Y_train_lr, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_ecm))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_ecm)))
print(np.mean(np.absolute(scores_r2)))
regressor = ridge.fit(X_train_lr, Y_train_lr)
y_predt = regressor.predict(X_test_lr)
mse = mean_squared_error(Y_test_lr, y_predt)
print(mse)
r_squared = r2_score(Y_test_lr, y_predt)
print(r_squared)
MSE Scores: [-37.26004778 -33.11538896 -30.85353803 -33.21020635 -38.17147922]
R2 Scores : [0.21061644 0.3042165 0.34896583 0.24759043 0.20498002]
34.52213206701642
0.26327384179540897
47.838217259770865
0.12515434598353792
listt = [X_train_lr['proba-of-rec'], X_train_lr['sex']]
test = pd.concat(listt, axis=1)
listt1 = [X_test_lr['proba-of-rec'], X_test_lr['sex']]
test2 = pd.concat(listt1, axis = 1)
regressor = ridge.fit(test,Y_train_lr)
y_pred = regressor.predict(test2)
mse=mean_squared_error(Y_test_lr, y_pred)
print(mse)
r_squared=r2_score(Y_test_lr, y_pred)
print(r_squared)
47.85186082940176 0.12490483799012675
elasticnet = ElasticNet()
scores_mse = cross_val_score(elasticnet, X_train_lr, Y_train_lr, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(elasticnet, X_train_lr, Y_train_lr, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_mse))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_mse)))
print(np.mean(np.absolute(scores_r2)))
regressor = elasticnet.fit(X_train_lr, Y_train_lr)
y_predt = regressor.predict(X_test_lr)
mse = mean_squared_error(Y_test_lr, y_predt)
print(mse)
r_squared = r2_score(Y_test_lr, y_predt)
print(r_squared)
MSE Scores: [-43.78846996 -43.55475806 -41.93985295 -39.25624843 -44.4572277 ]
R2 Scores : [0.07230665 0.08487616 0.11503577 0.11061146 0.07406302]
42.59931142076044
0.09137861021185652
47.85186082940176
0.12490483799012675
listt = [X_train_lr['proba-of-rec'], X_train_lr['sex']]
test = pd.concat(listt, axis=1)
listt1 = [X_test_lr['proba-of-rec'], X_test_lr['sex']]
test1 = pd.concat(listt1, axis = 1)
regressor = elasticnet.fit(test,Y_train_lr)
y_pred = regressor.predict(test1)
mse=mean_squared_error(Y_test_lr, y_pred)
print(mse)
r_squared=r2_score(Y_test_lr, y_pred)
print(r_squared)
52.509838719152846 0.039721652939032404
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
output = completed_dataset['class']
inputs = completed_dataset.drop(columns='class')
X_train, X_test, Y_train, Y_test = train_test_split(inputs, output, test_size = 0.2, random_state = 42)
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train, Y_train)
train = []
validation = []
for train_index, validation_index in skf.split(X_train, Y_train):
    train_groups = []
    validation_groups = []
    train_groups.append(train_index)
    validation_groups.append(validation_index)
    train.append(train_groups)
    validation.append(validation_groups)
regressor = LogisticRegression(multi_class = 'multinomial', max_iter=1000, solver = 'saga', penalty = None, n_jobs = -1)
scores_mse = cross_val_score(regressor, X_train, Y_train, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(regressor, X_train, Y_train, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_mse))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_mse)))
print(np.mean(np.absolute(scores_r2)))
mse_results = []
r2_results = []
regressor = regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)
accuracy=accuracy_score(Y_test, y_pred)
print('Accuracy: {}'. format(accuracy))
recall=recall_score(Y_test, y_pred, average = 'weighted')
print('Recall: {}'.format(recall))
MSE Scores: [-38.33333333 -50.09259259 -72.49056604 -68.56603774 -77.0754717 ]
R2 Scores : [ 0.17376264 -0.1162908  -0.58943737 -0.42276098 -0.51833177]
61.311600279524804
0.3641167116532883
Accuracy: 0.44776119402985076
Recall: 0.44776119402985076
Given the large number of classes, visualizing the multinomial logistic regression model is not feasible, which may be a limitation of our project. Besides working on the balancing of our dataset, we may consider reducing the number of classes to improve on this aspect.
To deal with high dimensionality, we could try univariate logistic regressions over the features. Nonetheless, given their low correlation, and since we do not really face the curse of dimensionality (the number of samples is above ten times the number of features), we do not consider it necessary to reduce the dimensionality a priori.
Lastly, reducing the number of classes in this particular case may increase the uncertainty about which type of cancer we are predicting. As a result, we have to reach a trade-off between the number of classes and the generalization/uncertainty of the predicted cancer type.
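Although the full model cannot be visualized, a confusion matrix of the unpenalized model fitted above still gives a per-class view of where the errors go (a sketch we add, using the imports and variables already defined; it is not part of the original analysis):
#Sketch: confusion matrix of the multinomial model on the test split.
cm = confusion_matrix(Y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot(xticks_rotation='vertical')
plt.show()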
logreg = LogisticRegression(C = 20, solver='saga', penalty='l1', multi_class = 'multinomial', max_iter=2000, n_jobs = -1)
regressor=logreg.fit(X_train, Y_train)
y_pred_lasso = logreg.predict(X_test)
scores_mse = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_mse))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_mse)))
print(np.mean(np.absolute(scores_r2)))
accuracy=accuracy_score(Y_test, y_pred_lasso)
print('Accuracy: {}'.format(accuracy))
recall_lasso=recall_score(Y_test, y_pred_lasso, average = 'weighted')
print('Recall: {}'.format(recall_lasso))
MSE Scores: [-38.33333333 -50.87037037 -61.39622642 -57.83018868 -78.41509434]
R2 Scores : [ 0.17376264 -0.13362323 -0.34618147 -0.19998966 -0.54472138]
57.36904262753319
0.27965567337702374
Accuracy: 0.44776119402985076
Recall: 0.44776119402985076
logreg = LogisticRegression(C = 10, solver='saga', penalty='l2', multi_class = 'multinomial', max_iter=2000, n_jobs = -1)
regressor=logreg.fit(X_train, Y_train)
y_pred_ridge = logreg.predict(X_test)
scores_mse = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_mse))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_mse)))
print(np.mean(np.absolute(scores_r2)))
accuracy=accuracy_score(Y_test, y_pred_ridge)
print('Accuracy: {}'.format(accuracy))
recall_ridge=recall_score(Y_test, y_pred_ridge, average = 'weighted')
print('Recall: {}'.format(recall_ridge))
MSE Scores: [-34.87037037 -48.77777778 -63.0754717  -56.1509434  -56.96226415]
R2 Scores : [ 0.24840341 -0.08699075 -0.38300081 -0.16514493 -0.12211594]
51.96736547868623
0.2011311669427333
Accuracy: 0.4925373134328358
Recall: 0.4925373134328358
logreg = LogisticRegression(C = 10, solver='saga', penalty='elasticnet', multi_class = 'multinomial', max_iter=2000, n_jobs = -1, l1_ratio = 0.15)
regressor=logreg.fit(X_train, Y_train)
y_pred_en = logreg.predict(X_test)
scores_ecm = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'neg_mean_squared_error', n_jobs = -1)
scores_r2 = cross_val_score(logreg, X_train, Y_train, cv=skf, scoring = 'r2', n_jobs = -1)
print("MSE Scores: {}".format(scores_ecm))
print('R2 Scores : {}'.format(scores_r2))
print(np.mean(np.absolute(scores_ecm)))
print(np.mean(np.absolute(scores_r2)))
accuracy=accuracy_score(Y_test, y_pred_en)
print('Accuracy: {}'.format(accuracy))
recall_en=recall_score(Y_test, y_pred_en, average = 'weighted')
print('Recall: {}'.format(recall_en))
MSE Scores: [-34.87037037 -48.77777778 -63.0754717  -56.1509434  -56.96226415]
R2 Scores : [ 0.24840341 -0.08699075 -0.38300081 -0.16514493 -0.12211594]
51.96736547868623
0.2011311669427333
Accuracy: 0.4925373134328358
Recall: 0.4925373134328358
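As a closing summary (a sketch we add, using the recall values stored above), the three penalized logistic models can be compared side by side:
#Sketch: collect the test-set recall of the three penalized logistic models.
summary = pd.DataFrame({
    'model': ['L1 (lasso)', 'L2 (ridge)', 'elastic net'],
    'recall': [recall_lasso, recall_ridge, recall_en],
})
print(summary)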