Machine Learning with Data on Integrin Genes¶
import pandas as pd
import numpy as np
integrins = pd.read_excel(r"C:\Users\QBPAM\Downloads\'25 summer BigData AI Cancer class by Yongmei Wang\gtex_integrin_7_organs.xlsx")
integrins
| primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | 4.9663 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
2 | Ovary | 2.3953 | -5.0116 | 1.4547 | 4.2593 | -0.7346 | 4.4149 | 0.2642 | 1.5216 | 4.3492 | ... | 3.6816 | 1.5465 | 7.2964 | -0.9406 | 2.7742 | 5.0414 | 2.0325 | 0.7579 | 2.2573 | 1.2516 |
3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
4 | Breast | 2.0569 | -2.4659 | 3.3993 | 3.1311 | 3.0074 | 4.4977 | -1.7809 | 2.7139 | 7.8698 | ... | 4.7340 | 0.6332 | 7.3496 | -0.9406 | 2.5338 | 6.5696 | 1.7229 | -0.6416 | 3.1195 | 1.1050 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1983 | Prostate | 2.9581 | -4.6082 | 1.1641 | 4.6938 | 1.5902 | 5.8625 | -0.5125 | 1.7617 | 7.4152 | ... | 3.8798 | -1.4699 | 7.5163 | -0.3752 | 2.9562 | 5.3035 | 4.4304 | -0.9406 | 3.6136 | 0.4233 |
1984 | Breast | 4.3184 | -6.5064 | 1.0433 | 4.8440 | 3.5498 | 4.6809 | 1.0293 | 3.3478 | 6.2136 | ... | 5.3256 | -0.0725 | 7.7516 | 1.1382 | 2.1411 | 7.1132 | 0.3796 | 0.0854 | 3.8650 | 1.0151 |
1985 | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | 5.3597 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1987 rows × 28 columns
brain_lung = integrins[integrins['primary_site'].isin(['Brain', 'Lung'])]
brain_lung
| primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | 4.9663 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
5 | Lung | 6.0732 | -2.4659 | 3.9901 | 7.3945 | 4.7688 | 5.1157 | 4.3356 | 2.3366 | 5.0527 | ... | 3.7378 | 4.7247 | 7.5016 | 5.1396 | 2.5036 | 6.5443 | 4.6531 | 3.8136 | 5.8679 | 0.7407 |
6 | Lung | 4.2510 | -5.0116 | 3.3076 | 6.1715 | 3.1129 | 5.2954 | 2.2960 | 1.1184 | 5.2392 | ... | 4.7104 | 2.7530 | 7.5022 | 4.0730 | 2.6325 | 6.0483 | 5.0562 | 2.6962 | 5.1611 | 0.9343 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1980 | Brain | 0.6969 | -6.5064 | -0.9686 | 2.3760 | -2.2447 | 4.0739 | -0.6193 | -4.0350 | 4.8788 | ... | 2.7357 | 1.5806 | 4.6882 | -0.9971 | -0.5756 | 3.5136 | 0.9343 | -1.0862 | 0.4340 | -2.2447 |
1981 | Brain | 0.1124 | -5.0116 | 2.2482 | 2.8897 | -0.5125 | 4.6445 | 0.3115 | -3.6259 | 4.5110 | ... | 2.1147 | 0.9716 | 5.1202 | 0.6608 | 0.4761 | 3.2343 | 0.8408 | -0.0574 | -0.1828 | -2.5479 |
1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1985 | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | 5.3597 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1440 rows × 28 columns
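As a quick sanity check on the filter, the class counts inside brain_lung should add up to the 1,440 rows shown above; a minimal sketch using only the DataFrame already in memory:
#sanity check: the Brain and Lung counts should sum to 1440
brain_lung['primary_site'].value_counts()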
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
X = brain_lung[['ITGA10']] #a DataFrame; independent variable (only ITGA10 expression is used as input to the model)
y = brain_lung['primary_site'] #dependent variable (the label to predict: Brain or Lung)
#define split between train and test;
#training the model on one portion of the data and evaluating it on another
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42) #70% training data, 30% testing data
#define model used: logistic regression
model = LogisticRegression()
model.fit(X_train, y_train) #train the model: learn the relationship between ITGA10 expression and tissue type
y_pred = model.predict(X_test) #use trained model to predict tissue types for test data (not seen during training)
accuracy = accuracy_score(y_test, y_pred) #compare predicted labels to true labels, calculates accuracy of predictions
print(f"Accuracy using ITGA10: {accuracy:.2f}") #accuracy percentage (rounded to 2 decimal places).
Accuracy using ITGA10: 0.94
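To see what the binary model actually learned from ITGA10, the fitted coefficient and intercept can be inspected. This is a minimal sketch using standard scikit-learn attributes; the exact values depend on the split above:
#inspect the fitted model: a positive coefficient means higher ITGA10 expression pushes the prediction toward the second listed class ('Lung')
print("Classes:", model.classes_)
print("ITGA10 coefficient:", model.coef_[0][0])
print("Intercept:", model.intercept_[0])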
#switch ITGA10 to ITGB4 and see the impact on accuracy
X = brain_lung[['ITGB4']]
y = brain_lung['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #60% training data, 40% testing data
#define model used: logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGB4: {accuracy:.2f}")
Accuracy using ITGB4: 0.82
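Note that the ITGB4 run also changed the split (60/40 instead of 70/30), so the two accuracies are not strictly comparable. A minimal sketch that holds the 70/30 split fixed while swapping genes (the gene list here is illustrative):
#compare single-gene classifiers on an identical 70/30 split so the accuracies are directly comparable
for gene in ['ITGA10', 'ITGB4', 'ITGA3']:
    Xg = brain_lung[[gene]]
    Xg_train, Xg_test, yg_train, yg_test = train_test_split(Xg, y, test_size = 0.3, random_state = 42)
    m = LogisticRegression().fit(Xg_train, yg_train)
    print(f"{gene}: {accuracy_score(yg_test, m.predict(Xg_test)):.2f}")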
Receiver Operating Characteristic (ROC) Curves and the Area Under the Curve (AUC)¶
Model accuracy = Number of correct predictions / Total number of predictions
len(y_test) #number of samples (rows) in test set
576
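The value from accuracy_score can be reproduced by hand with the formula above; a minimal sketch using y_test and y_pred from the ITGB4 run (576 test samples):
#accuracy by hand: correct predictions divided by total predictions
correct = (y_pred == y_test).sum()
print(correct, "/", len(y_test), "=", correct / len(y_test))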
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
X = brain_lung[['ITGA10']]
y = brain_lung['primary_site'].map({'Brain': 0, 'Lung': 1}) #binary encoding
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
model = LogisticRegression()
model.fit(X_train, y_train)
#predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] #probabilities for "Lung"
#compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
#plot
plt.figure(figsize = (6, 6))
plt.plot(fpr, tpr, label = f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
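roc_curve also returns the probability thresholds it swept to trace the curve; tabulating a few of them next to the corresponding FPR and TPR makes the curve easier to read. A minimal sketch using the fpr, tpr, and thresholds arrays computed above:
#peek at a subset of the thresholds behind the ROC curve
roc_table = pd.DataFrame({'threshold': thresholds, 'fpr': fpr, 'tpr': tpr})
roc_table.iloc[::max(1, len(roc_table) // 10)] #show roughly every tenth operating point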
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
X = brain_lung[['ITGB4']]
y = brain_lung['primary_site'].map({'Brain': 0, 'Lung': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1] # probabilities for Lung
#compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
#plot
plt.figure(figsize = (6, 6))
plt.plot(fpr, tpr, label = f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') #random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()
#AUROC curve using two integrins
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
X = brain_lung[['ITGA3','ITGB4']]
y = brain_lung['primary_site'].map({'Brain': 0, 'Lung': 1}) # Binary encoding
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
plt.figure(figsize = (6, 6))
plt.plot(fpr, tpr, label = f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGA3 and ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()
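To compare the three feature sets directly, their ROC curves can be overlaid on a single axis. This sketch refits each model with the same 70/30 split and the binary labels used above:
#overlay ROC curves for the one-gene and two-gene models
plt.figure(figsize = (6, 6))
for label, genes in [('ITGA10', ['ITGA10']), ('ITGB4', ['ITGB4']), ('ITGA3 + ITGB4', ['ITGA3', 'ITGB4'])]:
    Xg = brain_lung[genes]
    Xg_train, Xg_test, yg_train, yg_test = train_test_split(Xg, y, test_size = 0.3, random_state = 42)
    proba = LogisticRegression().fit(Xg_train, yg_train).predict_proba(Xg_test)[:, 1]
    f, t, _ = roc_curve(yg_test, proba)
    plt.plot(f, t, label = f"{label} (AUC = {roc_auc_score(yg_test, proba):.2f})")
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') #random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves: single-gene vs two-gene models')
plt.legend()
plt.grid(True)
plt.show()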
Machine Learning Model Performing Multiclass Classification¶
The classes for the integrin gene-expression data are the seven tissue types (primary sites).
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
selected_genes = ['ITGA10', 'ITGB4']
#X = integrins.iloc[:, -27:] # Assuming the last 27 columns are integrins
X = integrins[selected_genes]
y = integrins['primary_site']
#encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)
#train multinomial logistic regression
#model = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
#scikit-learn already uses the multinomial formulation with the lbfgs solver; passing multi_class = 'multinomial' explicitly triggers a deprecation warning because the parameter is being removed
model = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names = le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.7939698492462312

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.77      1.00      0.87        10
       Brain       0.81      0.94      0.87       247
      Breast       0.64      0.41      0.50        44
       Liver       1.00      0.65      0.79        23
        Lung       0.76      0.88      0.82        43
       Ovary       0.50      0.10      0.17        10
    Prostate       0.75      0.14      0.24        21

    accuracy                           0.79       398
   macro avg       0.75      0.59      0.61       398
weighted avg       0.78      0.79      0.77       398

Confusion Matrix:
[[ 10   0   0   0   0   0   0]
 [  3 231   3   0   8   1   1]
 [  0  25  18   0   1   0   0]
 [  0   8   0  15   0   0   0]
 [  0   4   1   0  38   0   0]
 [  0   6   0   0   3   1   0]
 [  0  12   6   0   0   0   3]]
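LabelEncoder assigns each tissue an integer in alphabetical order, and those integers index the rows and columns of the confusion matrix. A quick sketch of the mapping:
#show which integer code LabelEncoder assigned to each tissue type
for code, name in enumerate(le.classes_):
    print(code, '->', name)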
#format the confusion matrix for visual
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred) #compute raw confusion matrix
plt.figure(figsize = (8, 6))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Blues',
xticklabels = le.classes_, yticklabels = le.classes_)
#annot = True writes the count in each cell, fmt = 'd' formats the counts as integers, cmap sets the blue color gradient, and the tick labels show the class (tissue type) names
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
Explanation: A confusion matrix is a table showing how well the model's predictions matched the actual labels: the true classes are on the rows and the predicted classes are on the columns. Because the matrix is square (same number of rows and columns), the values on the diagonal from top-left to bottom-right count the samples for which the predicted class matched the true class (the model's correct predictions), while every off-diagonal value counts samples that were misclassified as another class.
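A tiny toy example (made-up labels, purely for illustration) makes the diagonal versus off-diagonal reading concrete:
#toy confusion matrix: rows are true labels, columns are predicted labels
y_true_toy = ['Brain', 'Brain', 'Lung', 'Lung', 'Lung']
y_pred_toy = ['Brain', 'Lung', 'Lung', 'Lung', 'Brain']
print(confusion_matrix(y_true_toy, y_pred_toy, labels = ['Brain', 'Lung']))
#the diagonal (1 correct Brain, 2 correct Lung) counts correct predictions; the off-diagonal entries count misclassifications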
integrins['primary_site'].value_counts()
primary_site
Brain          1152
Lung            288
Breast          179
Liver           110
Prostate        100
Ovary            88
Bone Marrow      70
Name: count, dtype: int64
integrins.shape #(rows, columns)
(1987, 28)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.79
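Because the classes are heavily imbalanced toward Brain (1152 of 1987 samples, roughly 0.58), a useful reference point for the 0.79 accuracy is the baseline of always predicting the most common tissue; a minimal sketch:
#majority-class baseline: accuracy of always predicting the most common tissue (Brain)
majority_baseline = integrins['primary_site'].value_counts(normalize = True).max()
print(f"Majority-class baseline accuracy: {majority_baseline:.2f}")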