Split Violin Plot and Binary Classification with ROC Curves¶
In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
In [3]:
integrins = pd.read_excel(r"C:\Users\QBPAM\Downloads\'25 summer BigData AI Cancer class by Yongmei Wang\gtex_integrin_7_organs.xlsx")
integrins
Out[3]:
primary_site | ITGA10 | ITGAD | ITGAM | ITGA3 | ITGBL1 | ITGAE | ITGA2 | ITGB3 | ITGA7 | ... | ITGA6 | ITGA2B | ITGB1 | ITGAL | ITGA9 | ITGB5 | ITGA8 | ITGA4 | ITGA1 | ITGA11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Brain | 0.5763 | -6.5064 | 2.2573 | 0.7832 | 1.0363 | 4.6035 | 2.5731 | -2.8262 | 4.9663 | ... | 2.8562 | 1.3846 | 5.8430 | 1.1316 | -0.7108 | 3.5387 | -0.0725 | -0.4521 | 0.2029 | -2.8262 |
1 | Lung | 4.9137 | -3.6259 | 4.7307 | 7.1584 | 1.7702 | 4.9556 | 1.9149 | 2.6067 | 3.9270 | ... | 4.2412 | 4.1211 | 7.7256 | 4.4900 | 2.9281 | 6.1483 | 5.1867 | 2.6185 | 4.7856 | -0.0277 |
2 | Ovary | 2.3953 | -5.0116 | 1.4547 | 4.2593 | -0.7346 | 4.4149 | 0.2642 | 1.5216 | 4.3492 | ... | 3.6816 | 1.5465 | 7.2964 | -0.9406 | 2.7742 | 5.0414 | 2.0325 | 0.7579 | 2.2573 | 1.2516 |
3 | Lung | 4.0541 | -2.3147 | 4.5053 | 7.5651 | 4.1788 | 4.1772 | 5.3695 | 1.8444 | 4.5355 | ... | 4.9631 | 1.9149 | 7.9947 | 3.3911 | 2.8462 | 6.7683 | 4.1636 | 2.7951 | 5.3284 | 1.2147 |
4 | Breast | 2.0569 | -2.4659 | 3.3993 | 3.1311 | 3.0074 | 4.4977 | -1.7809 | 2.7139 | 7.8698 | ... | 4.7340 | 0.6332 | 7.3496 | -0.9406 | 2.5338 | 6.5696 | 1.7229 | -0.6416 | 3.1195 | 1.1050 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1982 | Lung | 5.3067 | -3.8160 | 4.9065 | 7.5810 | 5.8714 | 4.7345 | 2.6185 | 3.1095 | 5.2032 | ... | 5.6080 | 3.7324 | 8.2849 | 4.6201 | 3.6440 | 6.7052 | 5.1094 | 3.3364 | 5.8153 | 1.6604 |
1983 | Prostate | 2.9581 | -4.6082 | 1.1641 | 4.6938 | 1.5902 | 5.8625 | -0.5125 | 1.7617 | 7.4152 | ... | 3.8798 | -1.4699 | 7.5163 | -0.3752 | 2.9562 | 5.3035 | 4.4304 | -0.9406 | 3.6136 | 0.4233 |
1984 | Breast | 4.3184 | -6.5064 | 1.0433 | 4.8440 | 3.5498 | 4.6809 | 1.0293 | 3.3478 | 6.2136 | ... | 5.3256 | -0.0725 | 7.7516 | 1.1382 | 2.1411 | 7.1132 | 0.3796 | 0.0854 | 3.8650 | 1.0151 |
1985 | Brain | 3.4622 | -5.5735 | 1.5013 | 5.4835 | 1.7702 | 4.7517 | 0.6790 | -3.1714 | 5.3597 | ... | 1.1960 | 4.1740 | 4.3002 | 0.5470 | -0.9971 | 3.7982 | -0.2498 | 1.4808 | -0.5125 | -0.5125 |
1986 | Lung | 2.5585 | -1.7809 | 6.7916 | 6.5865 | 2.7051 | 4.9519 | 4.3618 | 3.1892 | 7.7121 | ... | 3.5779 | 2.8974 | 7.7685 | 4.8294 | 1.9149 | 5.9989 | 2.4117 | 2.4198 | 4.2080 | 1.0007 |
1987 rows × 28 columns
In [9]:
ovary_breast = integrins[integrins['primary_site'].isin(['Ovary', 'Breast'])] #filter data by organ, display both ovary and breast data
#rearrange data
ovary_breast_vertical = ovary_breast_integrins.melt(id_vars = 'primary_site', var_name = 'integrin_gene', value_name = 'expression_levels')
ovary_breast_vertical
Out[9]:
primary_site | integrin_gene | expression_levels | |
---|---|---|---|
0 | Ovary | ITGA10 | 2.3953 |
1 | Breast | ITGA10 | 2.0569 |
2 | Ovary | ITGA10 | 1.1184 |
3 | Breast | ITGA10 | 3.0305 |
4 | Ovary | ITGA10 | 2.8260 |
... | ... | ... | ... |
7204 | Ovary | ITGA11 | 2.6940 |
7205 | Breast | ITGA11 | -0.0425 |
7206 | Breast | ITGA11 | 1.5563 |
7207 | Breast | ITGA11 | 2.3926 |
7208 | Breast | ITGA11 | 1.0151 |
7209 rows × 3 columns
In [10]:
plt.figure(figsize=(16, 6))
sns.violinplot(x = 'integrin_gene', y = 'expression_levels', hue = 'primary_site', data = ovary_breast_vertical, split = True, inner = 'quartile')
plt.title("Integrin Genes of the Ovary vs. the Breast")
plt.xlabel("Integrin Gene")
plt.ylabel("Gene Expression Levels")
plt.legend(title = 'primary_site')
plt.show()
In [15]:
#choosing integrin ITGA1 will give good performance, since we can see from the violin plot that the gene expression levels are more clearly distinct
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
X = ovary_breast[['ITGA1']] #a data frame; independent variable (only the ITGA1 gene expression is used as input to the model)
y = ovary_breast['primary_site'] #dependent variable (label trying to predict (ovary or breast))
#define split between train and test;
#training the model on one portion of the data and evaluating it on another
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42) #70% training data, 30% testing data
#define model used: logistic regression
model = LogisticRegression()
model.fit(X_train, y_train) #trains model, Learn relationship between ITGA1 expression and tissue type
y_pred = model.predict(X_test) #use trained model to predict tissue types for test data (not seen during training)
accuracy = accuracy_score(y_test, y_pred) #compare predicted labels to true labels, calculates accuracy of predictions
print(f"Accuracy using ITGA1: {accuracy:.2f}") #accuracy percentage (rounded to 2 decimal places).
Accuracy using ITGA1: 0.94
In [17]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
X = ovary_breast[['ITGA1']]
y = ovary_breast['primary_site'].map({'Ovary': 0, 'Breast': 1}) #binary encoding, negative and positive
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
model = LogisticRegression()
model.fit(X_train, y_train)
#predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] #probabilities for "breast"
#compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
#plot
plt.figure(figsize = (6, 6))
plt.plot(fpr, tpr, label = f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Ovary vs Breast) using ITGA1 expression')
plt.legend()
plt.grid(True)
plt.show()
In [23]:
#switch ITGA1 to ITGA11, accuracy will decrease, based on split violin plot, since means of gene expression for each tissue are similar
X = ovary_breast[['ITGA11']]
y = ovary_breast['primary_site']
#define split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #60% training data, 40% testing data
#define model used: logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA11: {accuracy:.2f}")
Accuracy using ITGA11: 0.67
In [24]:
#AUROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
X = ovary_breast[['ITGA11']]
y = ovary_breast['primary_site'].map({'Ovary': 0, 'Breast': 1}) #binary encoding, negative and positive
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
model = LogisticRegression()
model.fit(X_train, y_train)
#predict probabilities
y_proba = model.predict_proba(X_test)[:, 1] #probabilities for "breast"
#compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
#plot
plt.figure(figsize = (6, 6))
plt.plot(fpr, tpr, label = f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray') # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Ovary vs Breast) using ITGA11 expression')
plt.legend()
plt.grid(True)
plt.show()
Accuracy = 0.67 indicates that the model predicted the correct class (Ovary or Breast) 67% of the time at threshold = 0.5. However the AUC = 0.53, which is lower meaning that across all possible thresholds, the model is barely better than random guessing.
In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
selected_genes = ['ITGA1', 'ITGA11']
#X = integrins.iloc[:, -27:] # Assuming the last 27 columns are integrins
X = integrins[selected_genes]
y = integrins['primary_site']
#encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)
#train multinomial logistic regression
#model = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
#Scikit-learn defaults to multinomial when lbfgs is used, warning message if code includes multi-class = 'multinomial' because it will become outdated
model = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names = le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.8618090452261307 Classification Report: precision recall f1-score support Bone Marrow 0.78 0.70 0.74 10 Brain 0.96 0.98 0.97 247 Breast 0.65 0.55 0.59 44 Liver 0.95 0.83 0.88 23 Lung 0.70 0.93 0.80 43 Ovary 0.38 0.50 0.43 10 Prostate 0.58 0.33 0.42 21 accuracy 0.86 398 macro avg 0.72 0.69 0.69 398 weighted avg 0.86 0.86 0.86 398 Confusion Matrix: [[ 7 3 0 0 0 0 0] [ 2 241 1 0 0 3 0] [ 0 3 24 0 9 4 4] [ 0 0 0 19 4 0 0] [ 0 0 2 1 40 0 0] [ 0 3 1 0 0 5 1] [ 0 0 9 0 4 1 7]]
In [26]:
#format the confusion matrix for visual
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred) #compute raw confusion matrix
plt.figure(figsize = (8, 6))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Blues',
xticklabels = le.classes_, yticklabels = le.classes_)
#annot add numbers in cells, fmt format as integers, cmap blue color gradient, ticklabels shows class (tissue type) names
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
In [ ]: