import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

# Read the integrin expression workbook into a DataFrame (the file name
# indicates GTEx data for 7 organs). The path is machine-specific.
integrins = pd.read_excel(
    r"C:\Users\QBPAM\Downloads\'25 summer BigData AI Cancer class by Yongmei Wang\gtex_integrin_7_organs.xlsx"
)
integrins  # bare expression: shows a preview when run as a notebook cell

# Keep only the ovary and breast samples so the two tissues can be compared.
ovary_breast = integrins[integrins['primary_site'].isin(['Ovary', 'Breast'])]

# Reshape from wide to long format: one row per (sample, gene) pair with the
# tissue label carried along, which is the layout the grouped violin plot uses.
# The original melted `ovary_breast_integrins`, a name that is never defined in
# this file (NameError); the filtered frame above is the intended input.
# NOTE(review): assumes every column other than 'primary_site' is an integrin
# expression column -- confirm against the workbook.
ovary_breast_vertical = ovary_breast.melt(
    id_vars='primary_site',
    var_name='integrin_gene',
    value_name='expression_levels',
)
ovary_breast_vertical  # bare expression: notebook preview of the long table

# Split violin plot: one violin per integrin gene, with the two tissues
# (hue = primary_site) drawn as the two halves and quartile lines inside.
plt.figure(figsize=(16, 6))
sns.violinplot(
    data=ovary_breast_vertical,
    x='integrin_gene',
    y='expression_levels',
    hue='primary_site',
    split=True,
    inner='quartile',
)
# Set title and axis labels on the current axes in a single call.
plt.gca().set(
    title="Integrin Genes of the Ovary vs. the Breast",
    xlabel="Integrin Gene",
    ylabel="Gene Expression Levels",
)
plt.legend(title='primary_site')
plt.show()

#Choosing integrin ITGA1 should give good performance: the violin plot shows that its gene expression levels are clearly distinct between ovary and breast

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature matrix: ITGA1 expression only, kept as a one-column DataFrame
# because scikit-learn estimators expect two-dimensional input.
X = ovary_breast.loc[:, ['ITGA1']]
# Target: the tissue label of each sample (Ovary or Breast).
y = ovary_breast.loc[:, 'primary_site']

# Hold out 30% of the samples for evaluation (70% train / 30% test);
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict tissue labels for the held-out samples and score them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Accuracy is the fraction of correct predictions (0-1), shown with two decimals.
print(f"Accuracy using ITGA1: {accuracy:.2f}")

Accuracy using ITGA1: 0.94

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Rebuild the ITGA1 feature matrix and encode the tissue label numerically:
# Ovary is the negative class (0), Breast the positive class (1).
X = ovary_breast.loc[:, ['ITGA1']]
y = ovary_breast['primary_site'].map({'Ovary': 0, 'Breast': 1})

# 70% train / 30% test with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LogisticRegression().fit(X_train, y_train)

# Column 1 of predict_proba is the estimated probability of class 1 ("Breast").
y_proba = model.predict_proba(X_test)[:, 1]

# ROC points over the decision thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (Ovary vs Breast) using ITGA1 expression',
)
plt.legend()
plt.grid(True)
plt.show()

#Switching from ITGA1 to ITGA11 should decrease accuracy: in the split violin plot, the mean gene expression levels of the two tissues are similar

# Feature matrix: ITGA11 expression only (one-column DataFrame);
# target: the tissue label of each sample (Ovary or Breast).
X = ovary_breast.loc[:, ['ITGA11']]
y = ovary_breast.loc[:, 'primary_site']

# 60% train / 40% test with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Score predictions on the held-out samples; accuracy is a 0-1 fraction.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using ITGA11: {accuracy:.2f}")

Accuracy using ITGA11: 0.67

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Rebuild the ITGA11 feature matrix and encode the tissue label numerically:
# Ovary is the negative class (0), Breast the positive class (1).
X = ovary_breast.loc[:, ['ITGA11']]
y = ovary_breast['primary_site'].map({'Ovary': 0, 'Breast': 1})

# 70% train / 30% test with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LogisticRegression().fit(X_train, y_train)

# Column 1 of predict_proba is the estimated probability of class 1 ("Breast").
y_proba = model.predict_proba(X_test)[:, 1]

# ROC points over the decision thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (Ovary vs Breast) using ITGA11 expression',
)
plt.legend()
plt.grid(True)
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Multi-class experiment: predict the organ (every primary_site in the table)
# from the expression of two integrin genes.
selected_genes = ['ITGA1', 'ITGA11']
# Alternative feature set noted by the author: integrins.iloc[:, -27:]
# (assumes the last 27 columns are the integrin genes -- verify before using).
X = integrins.loc[:, selected_genes]
y = integrins.loc[:, 'primary_site']

# Encode organ names as integer class ids; le.classes_ keeps the id -> name lookup.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 80% train / 20% test with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# multi_class='multinomial' is deliberately not passed: per the author's note,
# scikit-learn already fits a multinomial model with the lbfgs solver, and the
# explicit argument raises a deprecation warning.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Overall accuracy, per-organ precision/recall/F1, and the raw confusion matrix.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8618090452261307

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.78      0.70      0.74        10
       Brain       0.96      0.98      0.97       247
      Breast       0.65      0.55      0.59        44
       Liver       0.95      0.83      0.88        23
        Lung       0.70      0.93      0.80        43
       Ovary       0.38      0.50      0.43        10
    Prostate       0.58      0.33      0.42        21

    accuracy                           0.86       398
   macro avg       0.72      0.69      0.69       398
weighted avg       0.86      0.86      0.86       398


Confusion Matrix:
[[  7   3   0   0   0   0   0]
 [  2 241   1   0   0   3   0]
 [  0   3  24   0   9   4   4]
 [  0   0   0  19   4   0   0]
 [  0   0   2   1  40   0   0]
 [  0   3   1   0   0   5   1]
 [  0   0   9   0   4   1   7]]

#visualize the confusion matrix as a heatmap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Compute the raw confusion matrix for the multi-class model.
# `labels` pins the matrix to one row/column per encoded class id, in the same
# order as le.classes_. Without it, a class absent from both y_test and y_pred
# would be dropped from the matrix and the tick labels below would be misaligned.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(le.classes_))))

plt.figure(figsize=(8, 6))
# annot writes the count in each cell, fmt='d' formats counts as integers,
# cmap is the blue colour gradient, tick labels show the tissue names.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	ITGA7	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
0	Brain	0.5763	-6.5064	2.2573	0.7832	1.0363	4.6035	2.5731	-2.8262	4.9663	...	2.8562	1.3846	5.8430	1.1316	-0.7108	3.5387	-0.0725	-0.4521	0.2029	-2.8262
1	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	3.9270	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
2	Ovary	2.3953	-5.0116	1.4547	4.2593	-0.7346	4.4149	0.2642	1.5216	4.3492	...	3.6816	1.5465	7.2964	-0.9406	2.7742	5.0414	2.0325	0.7579	2.2573	1.2516
3	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	4.5355	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
4	Breast	2.0569	-2.4659	3.3993	3.1311	3.0074	4.4977	-1.7809	2.7139	7.8698	...	4.7340	0.6332	7.3496	-0.9406	2.5338	6.5696	1.7229	-0.6416	3.1195	1.1050
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1982	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	5.2032	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1983	Prostate	2.9581	-4.6082	1.1641	4.6938	1.5902	5.8625	-0.5125	1.7617	7.4152	...	3.8798	-1.4699	7.5163	-0.3752	2.9562	5.3035	4.4304	-0.9406	3.6136	0.4233
1984	Breast	4.3184	-6.5064	1.0433	4.8440	3.5498	4.6809	1.0293	3.3478	6.2136	...	5.3256	-0.0725	7.7516	1.1382	2.1411	7.1132	0.3796	0.0854	3.8650	1.0151
1985	Brain	3.4622	-5.5735	1.5013	5.4835	1.7702	4.7517	0.6790	-3.1714	5.3597	...	1.1960	4.1740	4.3002	0.5470	-0.9971	3.7982	-0.2498	1.4808	-0.5125	-0.5125
1986	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	7.7121	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	primary_site	integrin_gene	expression_levels
0	Ovary	ITGA10	2.3953
1	Breast	ITGA10	2.0569
2	Ovary	ITGA10	1.1184
3	Breast	ITGA10	3.0305
4	Ovary	ITGA10	2.8260
...	...	...	...
7204	Ovary	ITGA11	2.6940
7205	Breast	ITGA11	-0.0425
7206	Breast	ITGA11	1.5563
7207	Breast	ITGA11	2.3926
7208	Breast	ITGA11	1.0151

Split Violin Plot and Binary Classification with ROC Curves