import pandas as pd
import matplotlib.pyplot as plt
# Read mc3_BRCA_mc3.txt.gz into a DataFrame. read_table splits on tabs by
# default, and pandas infers gzip decompression from the ".gz" suffix.
TCGA_BRCA_MC3_Public = pd.read_table(
    r"C:\Users\QBPAM\Downloads\'25 summer BigData AI Cancer class by Yongmei Wang\mc3_BRCA_mc3.txt.gz"
)
# Bare expression: as the last statement of a notebook cell it renders a preview.
TCGA_BRCA_MC3_Public
 | sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
# The 'effect' column records the mutation type of each row; count how many
# rows fall under each type (most frequent first).
mutation_type_counts = TCGA_BRCA_MC3_Public.loc[:, "effect"].value_counts()
# Bare expression so the notebook cell displays the counts.
mutation_type_counts
effect Missense_Mutation 45634 Silent 17122 Frame_Shift_Del 8522 3'UTR 6695 Nonsense_Mutation 3666 Intron 3212 5'UTR 2492 Splice_Site 1399 RNA 1160 Frame_Shift_Ins 610 3'Flank 530 5'Flank 443 In_Frame_Del 441 Translation_Start_Site 74 Nonstop_Mutation 66 In_Frame_Ins 34 large deletion 19 Name: count, dtype: int64
# Count the number of rows recorded for each value of the 'sample' column.
# NOTE: use column indexing, not attribute access -- DataFrame.sample is a method.
sample_types = TCGA_BRCA_MC3_Public.loc[:, "sample"].value_counts()
# Bare expression so the notebook cell displays the counts.
sample_types
sample TCGA-AC-A23H-01 6405 TCGA-EW-A2FV-01 4231 TCGA-D8-A27V-01 3332 TCGA-5L-AAT1-01 1995 TCGA-BH-A18G-01 1899 ... TCGA-AO-A03U-01 7 TCGA-A2-A25F-01 6 TCGA-LL-A440-01 6 TCGA-AC-A2FK-01 3 TCGA-EW-A1P1-01 3 Name: count, Length: 791, dtype: int64
# Count the number of rows recorded for each value of the 'gene' column,
# ordered from the most to the least frequent gene.
gene_types = TCGA_BRCA_MC3_Public.loc[:, "gene"].value_counts()
# Bare expression so the notebook cell displays the counts.
gene_types
gene PIK3CA 315 TTN 285 TP53 273 MUC16 141 CDH1 108 ... DHFR 1 ZNF354B 1 HSP90AB2P 1 RP5-828H9.1 1 C20orf78 1 Name: count, Length: 18065, dtype: int64
# Keep the first 50 entries of the per-gene counts; gene_types comes from
# value_counts(), so these are the 50 largest counts.
gene_types_top50 = gene_types.iloc[:50]

# Draw the counts as horizontal bars on a new 10x8-inch figure.
plt.figure(figsize=(10, 8))
gene_types_top50.plot.barh(color='magenta', edgecolor='black')

# Label the current axes, then flip the y-axis so the first (largest) entry
# is drawn at the top instead of the bottom.
plt.gca().set(
    xlabel="Number of Mutations",
    ylabel="Gene",
    title="Top 50 Most Frequently Mutated Genes in TCGA BRCA (MC3)",
)
plt.gca().invert_yaxis()

# Fit the long gene labels inside the figure before rendering it.
plt.tight_layout()
plt.show()
# Read TCGA.LAML.sampleMap_mutation_wustl.gz into a DataFrame. read_table
# splits on tabs by default, and pandas infers gzip from the ".gz" suffix.
TCGA_LAML = pd.read_table(
    r"C:\Users\QBPAM\Downloads\'25 summer BigData AI Cancer class by Yongmei Wang\TCGA.LAML.sampleMap_mutation_wustl.gz"
)
# Bare expression: as the last statement of a notebook cell it renders a preview.
TCGA_LAML
 | sample | chr | start | end | reference | alt | gene | effect | DNA_VAF | RNA_VAF | Amino_Acid_Change |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-AB-2802-03 | chrMT | 14767 | 14767 | T | C | MT-CYB | Missense_Mutation | NaN | NaN | p.I7T |
1 | TCGA-AB-2802-03 | chr1 | 119270684 | 119270684 | T | A | TBX15 | Missense_Mutation | NaN | NaN | p.I59F |
2 | TCGA-AB-2802-03 | chr1 | 150324146 | 150324146 | T | C | TCHHL1 | Missense_Mutation | NaN | NaN | p.Q879R |
3 | TCGA-AB-2802-03 | chr2 | 25310747 | 25310747 | G | A | DNMT3A | Missense_Mutation | NaN | NaN | p.R882C |
4 | TCGA-AB-2802-03 | chr2 | 208821357 | 208821357 | C | T | IDH1 | Missense_Mutation | NaN | NaN | p.R132H |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2580 | TCGA-AB-3012-03 | chr19 | 52567974 | 52567974 | G | A | DHX34 | Missense_Mutation | NaN | NaN | p.V648I |
2581 | TCGA-AB-3012-03 | chr22 | 20654724 | 20654724 | C | T | TOP3B | Missense_Mutation | NaN | NaN | p.V147M |
2582 | TCGA-AB-3012-03 | chr22 | 28018596 | 28018596 | G | T | EWSR1 | Splice_Site | NaN | NaN | e11+1 |
2583 | TCGA-AB-3012-03 | chr12 | 11883488 | 11883500 | ATCGATCTCCTCA | - | ETV6 | Frame_Shift_Del | NaN | NaN | p.Y104fs |
2584 | TCGA-AB-3012-03 | chr16 | 2752700 | 2752701 | - | TC | SRRM2 | Frame_Shift_Ins | NaN | NaN | p.S724fs |
2585 rows × 11 columns
# Count how many LAML rows fall under each value of the 'effect' column
# (most frequent first).
mutation_type_counts1 = TCGA_LAML.loc[:, "effect"].value_counts()
# Bare expression so the notebook cell displays the counts.
mutation_type_counts1
effect Missense_Mutation 1539 Silent 510 Frame_Shift_Ins 124 Nonsense_Mutation 117 RNA 110 Frame_Shift_Del 67 Splice_Site 54 In_Frame_Ins 51 In_Frame_Del 11 3'UTR 1 Intron 1 Name: count, dtype: int64
# Count the number of LAML rows recorded for each value of the 'sample' column.
# NOTE: use column indexing, not attribute access -- DataFrame.sample is a method.
sample_types1 = TCGA_LAML.loc[:, "sample"].value_counts()
# Bare expression so the notebook cell displays the counts.
sample_types1
sample TCGA-AB-3009-03 51 TCGA-AB-3002-03 36 TCGA-AB-2807-03 35 TCGA-AB-2927-03 30 TCGA-AB-2959-03 29 .. TCGA-AB-2834-03 1 TCGA-AB-2823-03 1 TCGA-AB-2883-03 1 TCGA-AB-2909-03 1 TCGA-AB-2954-03 1 Name: count, Length: 197, dtype: int64
# Count the number of LAML rows recorded for each value of the 'gene' column,
# ordered from the most to the least frequent gene.
gene_types1 = TCGA_LAML.loc[:, "gene"].value_counts()
# Bare expression so the notebook cell displays the counts.
gene_types1
gene DNMT3A 57 FLT3 56 NPM1 55 TET2 28 RUNX1 21 .. CTCF 1 LOC100130734 1 LOC100132903 1 CILP2 1 RPL32 1 Name: count, Length: 1890, dtype: int64
# Horizontal bar chart of the 25 most frequently mutated genes in the LAML table.
# gene_types1 comes from value_counts(), so head(25) yields the 25 largest counts.
gene_types_top25 = gene_types1.head(25)
plt.figure(figsize=(10, 8))
gene_types_top25.plot(kind='barh', color='pink', edgecolor='black')
plt.xlabel("Number of Mutations")
plt.ylabel("Gene")
# TCGA's LAML cohort is Acute Myeloid Leukemia (not lymphoblastic leukemia),
# so the title names the disease accordingly.
plt.title("Top 25 Most Frequently Mutated Genes in TCGA Acute Myeloid Leukemia")
# Flip the y-axis so the most frequently mutated gene is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()