# First we need to load the necessary modules IF we are on Vera
# Double-check this for next year!
%%bash
module load scipy/1.13.1
module load Seaborn/0.13.2-gfbf-2024a
module load  matplotlib/3.9.2

# 1. Integers (int): whole numbers without a decimal part
gene_count, chromosome_number = 42, 23

# Multiplying an integer by an integer gives back another integer
print(
    f"Number of genes: {gene_count}",
    f"If we sequence twice as many genes: {gene_count * 2}",
    sep="\n",
)

Number of genes: 42
If we sequence twice as many genes: 84

# 2. Floating point numbers (float): values that carry a decimal part
gene_expression, p_value, fold_change = 1.45, 0.001, 2.5

# Typical arithmetic and comparisons used in expression analysis
print("\n".join([
    f"Expression value: {gene_expression}",
    f"Expression after fold change: {gene_expression * fold_change}",
    f"Is this p-value significant? {p_value < 0.05}",
]))

Expression value: 1.45
Expression after fold change: 3.625
Is this p-value significant? True

# 3. Strings (str): text data such as identifiers and sequences
gene_name, dna_sequence = "BRCA1", "ATGCGT"

# String methods and len() that come up often in bioinformatics
print(
    f"Gene name: {gene_name}",
    f"Gene name in lowercase: {gene_name.lower()}",
    f"DNA sequence length: {len(dna_sequence)}",
    f"Number of G bases: {dna_sequence.count('G')}",
    sep="\n",
)

Gene name: BRCA1
Gene name in lowercase: brca1
DNA sequence length: 6
Number of G bases: 2

# Type conversion: dividing one integer by another with / yields a float
sequence_length = len(dna_sequence)  # integer number of bases
# Add up the G and C counts, then divide by the total length (a float)
gc_content = sum(dna_sequence.count(base) for base in "GC") / len(dna_sequence)
print(f"GC content: {gc_content:.2f}")  # .2f formats to 2 decimal places

GC content: 0.50

# Example 1: p-values coming out of a differential expression analysis
p_values = [
    0.042, 0.127, 0.031, 0.982,
    0.001, 0.067, 0.015,
]

# Walk through the list and add 1 for every p-value below 0.05
significant_count = 0
for p_value in p_values:
    significant_count += 1 if p_value < 0.05 else 0

print(f"Number of significant p-values: {significant_count}")

Number of significant p-values: 4

# We can do the same thing more elegantly with a list comprehension.
# The comprehension keeps only the p-values below 0.05; the length of that
# filtered list is the number of significant results. The values themselves
# do not need to be transformed, because only the count is used.
significant_count = len([p for p in p_values if p < 0.05])
print(f"Number of significant p-values (using list comprehension): {significant_count}")

Number of significant p-values (using list comprehension): 4

# Example 2: is a given gene part of our dataset?
antibiotic_resistance_genes = [
    "gyrA",
    "parC",
    "qnrS",
    "tetM",
    "blaTEM",
]
gene_of_interest = "gyrA"

# `not in` is the negated membership test, so the "missing" branch comes first
if gene_of_interest not in antibiotic_resistance_genes:
    print(f"{gene_of_interest} is not in our list of resistance genes")
else:
    print(f"{gene_of_interest} is in our list of resistance genes")

gyrA is in our list of resistance genes

# Repeat the membership check with a different gene
gene_of_interest = "mecA"
if gene_of_interest not in antibiotic_resistance_genes:
    print(f"{gene_of_interest} is not in our list of resistance genes")
else:
    print(f"{gene_of_interest} is in our list of resistance genes")

mecA is not in our list of resistance genes

# Example 3: gene names paired with their expression values
gene_names = [
    "gyrA",
    "parC",
    "qnrS",
    "tetM",
]
expression_values = [
    1.5,
    0.8,
    2.1,
    0.3,
]

# zip() walks both lists in lockstep, giving one (name, value) pair per step
for gene, expression in zip(gene_names, expression_values):
    print(f"Gene {gene} has expression value: {expression}")

Gene gyrA has expression value: 1.5
Gene parC has expression value: 0.8
Gene qnrS has expression value: 2.1
Gene tetM has expression value: 0.3

# Keep only the genes whose expression value is above 1.0
highly_expressed = [
    name
    for name, level in zip(gene_names, expression_values)
    if level > 1.0
]
print(f"\nHighly expressed genes: {highly_expressed}")

Highly expressed genes: ['gyrA', 'qnrS']

# A list of p-values to practise indexing and slicing on
p_values = [
    0.001, 0.043, 0.028, 0.015,
    0.062, 0.008, 0.051, 0.037,
]

# Single elements: indices start at 0
print(f"First p-value (index 0): {p_values[0]}")
print(f"Fifth p-value (index 4): {p_values[4]}")

# A slice covering indices 3, 4 and 5
# list[start:end] includes `start` but stops just before `end`
print(f"P-values from position 3 to 5: {p_values[3:6]}")

# A negative index counts from the end; -1 is the last element
print(f"Last p-value: {p_values[-1]}")

# Leaving out `start` begins the slice at index 0
print(f"First four p-values: {p_values[:4]}")

# Leaving out `end` runs the slice to the end of the list
print(f"All p-values from position 5 onwards: {p_values[5:]}")

First p-value (index 0): 0.001
Fifth p-value (index 4): 0.062
P-values from position 3 to 5: [0.015, 0.062, 0.008]
Last p-value: 0.037
First four p-values: [0.001, 0.043, 0.028, 0.015]
All p-values from position 5 onwards: [0.008, 0.051, 0.037]

# A dictionary that maps each gene ID (key) to its expression value
gene_expression = dict(
    BRCA1=2.5,
    TP53=1.8,
    EGFR=3.2,
    KRAS=0.7,
)

# Look a value up by its key
print(f"BRCA1 expression level: {gene_expression['BRCA1']}")
# Assigning to a key that does not exist yet adds a new entry
gene_expression['PTEN'] = 1.4

gene_expression

BRCA1 expression level: 2.5

{'BRCA1': 2.5, 'TP53': 1.8, 'EGFR': 3.2, 'KRAS': 0.7, 'PTEN': 1.4}

# A nested dictionary: each gene maps to its own dictionary of measurements
gene_data = {
    'BRCA1': dict(expression=2.5, p_value=0.001, chromosome=17),
    'TP53': dict(expression=1.8, p_value=0.003, chromosome=17),
}

# Chain two key lookups to reach a value inside the inner dictionary
print(f"\nTP53 data:")
print(f"Expression: {gene_data['TP53']['expression']}")
print(f"Located on chromosome: {gene_data['TP53']['chromosome']}")

TP53 data:
Expression: 1.8
Located on chromosome: 17

# A richer structure: every gene holds several parallel lists
gene_data_series = {
    'BRCA1': dict(
        expression=[1.2, 2.4, 1.8, 2.1],
        p_values=[0.001, 0.002, 0.001, 0.005],
        sample_ids=['A1', 'A2', 'A3', 'A4'],
    ),
    'TP53': dict(
        expression=[0.8, 1.1, 3.2, 2.8],
        p_values=[0.003, 0.001, 0.002, 0.001],
        sample_ids=['A1', 'A2', 'A3', 'A4'],
    ),
}

# Pull the individual lists for one gene out of the nested structure
print("\nDetailed analysis of TP53:")
print("Expression values:", gene_data_series['TP53']['expression'])
print("Corresponding p-values:", gene_data_series['TP53']['p_values'])
print("Samples:", gene_data_series['TP53']['sample_ids'])

Detailed analysis of TP53:
Expression values: [0.8, 1.1, 3.2, 2.8]
Corresponding p-values: [0.003, 0.001, 0.002, 0.001]
Samples: ['A1', 'A2', 'A3', 'A4']

# Practical example: tally how often each gene appears in a list of mutations
mutation_counts = {}
sequenced_genes = ['TP53', 'BRCA1', 'TP53', 'KRAS', 'BRCA1', 'TP53']

for gene in sequenced_genes:
    # .get() returns 0 the first time a gene is seen, so one line covers
    # both the "new gene" and the "seen before" case
    mutation_counts[gene] = mutation_counts.get(gene, 0) + 1

print("\nMutation frequencies:")
for gene, count in mutation_counts.items():
    print(f"{gene}: {count} mutations")

Mutation frequencies:
TP53: 3 mutations
BRCA1: 2 mutations
KRAS: 1 mutations

import pandas as pd

# Build a small DataFrame: three genes, each measured at five sample sites
expression_df = pd.DataFrame({
    # Each gene name is repeated once per sample site
    'gene_name': [gene for gene in ('BRCA1', 'TP53', 'EGFR') for _ in range(5)],
    'sample_site': ['brain', 'liver', 'lung', 'kidney', 'heart'] * 3,
    'expression': [
        245.6, 178.9, 334.2, 198.7, 267.3,
        678.9, 589.4, 723.1, 645.8, 701.2,
        1456.2, 1234.5, 1567.8, 1345.6, 1489.2,
    ],
    'p_value': [
        0.001, 0.003, 0.0005, 0.002, 0.001,
        0.04, 0.02, 0.03, 0.01, 0.02,
        0.0001, 0.0003, 0.0002, 0.0001, 0.0004,
    ],
    'fold_change': [
        2.5, 1.8, 3.2, 2.1, 2.7,
        0.3, 0.4, 0.2, 0.5, 0.3,
        4.2, 3.8, 4.5, 4.0, 4.3,
    ],
    'concentration_g': [
        10000, 8500, 12000, 9500, 11000,
        2045, 1890, 2200, 1950, 2100,
        15, 12, 18, 14, 16,
    ],
})

# Turn the gene_name column into the row index
expression_df = expression_df.set_index('gene_name')
expression_df

# Select a single column (all p-values)
p_values = expression_df['p_value']
p_values

gene_name
BRCA1    0.0010
BRCA1    0.0030
BRCA1    0.0005
BRCA1    0.0020
BRCA1    0.0010
TP53     0.0400
TP53     0.0200
TP53     0.0300
TP53     0.0100
TP53     0.0200
EGFR     0.0001
EGFR     0.0003
EGFR     0.0002
EGFR     0.0001
EGFR     0.0004
Name: p_value, dtype: float64

# Pick several columns at once by passing a list of column names
condition_data = expression_df.loc[:, ['p_value', 'fold_change']]
condition_data

# Pick every row whose index label is 'BRCA1'
brca1_data = expression_df.loc['BRCA1', :]
brca1_data

# Derive a new column from an existing one (grams divided by 1000)
expression_df["concentration_Kg"] = expression_df["concentration_g"] / 1000
expression_df

# Count the distinct genes measured in the kidney that have
# p_value <= 0.05 and fold_change > 2
high_expression = expression_df.loc[
    (expression_df['sample_site'] == "kidney")
    & (expression_df['fold_change'] > 2)
    & (expression_df['p_value'] <= 0.05)
].index.nunique()
high_expression

2

import numpy as np
# scipy.stats provides ttest_rel, which the paired t-test step further down
# in this notebook calls as `stats.ttest_rel(...)`; without this import that
# cell fails with a NameError.
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)

# Create DataFrame with expression data: one row per sample, with one column
# for the measurement before and one for the measurement after
expression_data = pd.DataFrame({
    'expression_before': [98.2, 103.4, 86.5, 92.8, 107.2, 95.6, 96.7, 101.3, 89.4,
                         91.2, 94.5, 102.8, 97.6, 95.4, 88.3, 93.2, 99.1, 96.8,
                         104.5, 98.7, 91.4, 94.6, 96.2, 97.8, 95.5, 93.4, 97.1,
                         99.3, 92.6, 95.8],
    'expression_after':  [175.4, 182.3, 168.7, 188.9, 177.2, 165.8, 179.4, 173.2,
                         181.5, 174.6, 169.8, 183.4, 176.7, 172.3, 178.9, 184.5,
                         171.2, 177.8, 182.1, 175.6, 179.8, 173.4, 176.5, 180.2,
                         174.3, 178.7, 181.9, 175.1, 177.6, 182.4]
})

# Add sample IDs as index (sample_1, sample_2, ...)
expression_data.index = [f'sample_{i+1}' for i in range(len(expression_data))]

# Preview the data
print("First few rows of our expression data:")
print(expression_data.head())

First few rows of our expression data:
          expression_before  expression_after
sample_1               98.2             175.4
sample_2              103.4             182.3
sample_3               86.5             168.7
sample_4               92.8             188.9
sample_5              107.2             177.2

# Calculate mean, standard deviation, and standard error for both conditions
# Exercise template: every ______ below is a placeholder that has to be
# replaced before this cell can run.
# Each list holds two values, one for 'expression_before' and one for
# 'expression_after'; they become the rows labelled 'Before' and 'After'.
summary_stats = pd.DataFrame({
    'mean': [expression_data['expression_before'].______, 
            expression_data['expression_after'].______],
    'std': [expression_data['expression_before'].______, 
           expression_data['expression_after'].______],
    # sem: the column's standard deviation divided by a square root
    'sem': [expression_data['expression_before'].std() / np.sqrt(______), 
           expression_data['expression_after'].std() / np.sqrt(______)]
}, index=['Before', 'After'])

print("Summary Statistics:")
print(summary_stats)

# Create a column called "fold_change" as expression_after / expression_before

# Calculate the median fold change and print that value

# Perform paired t-test
# Exercise template: replace each '_______' placeholder with the name of a
# column in expression_data before running.
# This cell needs `stats` (scipy.stats) to be imported.
t_stat, p_value = stats.ttest_rel(
    expression_data['_______'],
    expression_data['_______'] 
)

# .4f prints 4 decimal places; .4e prints scientific notation with 4 decimals
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4e}")

import matplotlib.pyplot as plt
import seaborn as sns

# Convert data from wide to long format
# The two value columns are stacked into a single 'expression' column, and
# the new 'condition' column records which original column each value came from.
expression_data_long = pd.melt(
    expression_data, 
    value_vars=['expression_before', 'expression_after'],
    var_name='condition',
    value_name='expression'
)

# Create the plot
# Exercise template: replace each '_______' placeholder with a column name
# from expression_data_long before running.
plt.figure(figsize=(8, 6))
sns.boxplot(data=expression_data_long, x='_______', y='_______')
sns.despine()
plt.title('BRCA1 Expression Before and After Treatment')
plt.ylabel('Expression Level')
plt.tight_layout()
plt.show()

def analyze_list(expression_list):
    """
    Summarise a plain list of expression values.

    Returns a tuple (median_expression, num_high_expression), where the
    second item is how many values in the list are greater than 150.
    """
    # Median of all values, computed by NumPy
    median_expression = np.median(expression_list)

    # Collect the values above the 150 cut-off and count them
    num_high_expression = len([value for value in expression_list if value > 150])

    return median_expression, num_high_expression

def analyze_dataframe(df):
    """
    Summarise the 'expression' column of a dataframe.

    Returns a tuple (median_expression, num_high_expression), where the
    second item is how many rows have an expression greater than 150.
    """
    expression = df['expression']

    # Boolean mask that is True for every row above the 150 cut-off;
    # summing it counts the True entries
    above_cutoff = expression > 150

    return expression.median(), int(above_cutoff.sum())

# Benchmark input: 100,000 rounded draws from a normal distribution with
# mean 200 and standard deviation 15
list_timeit = [
    round(np.random.normal(loc=200, scale=15))
    for _ in range(100_000)
]

# The same values as a DataFrame column; the scalar sample name is repeated
# on every row
df_timeit = pd.DataFrame({
    'sample_name': "Sea_water",
    'expression': list_timeit,
})

%%timeit

analyze_list(list_timeit)

4.75 ms ± 70.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%%timeit

analyze_dataframe(df_timeit)

1.37 ms ± 24.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

The basic data types¶

Lists and for loops¶

Dictionaries¶

DataFrames and Pandas¶

Structure¶

Long vs wide data frames¶

Basic Operations¶

Accessing Data¶

Python Operators for Data Filtering¶

Your Task¶

2. Calculate statistics Fold Change¶

3. Perform Statistical Analysis¶

4. Plot the data¶

Why Pandas is Fast: A Bioinformatics Perspective¶

Performance Comparison Example¶

Why Pandas is Faster¶

Vectorized Operations¶

Optimized C Backend¶

Memory Efficiency¶

Questions¶

	sample_site	expression	p_value	fold_change	concentration_g
gene_name
BRCA1	brain	245.6	0.0010	2.5	10000
BRCA1	liver	178.9	0.0030	1.8	8500
BRCA1	lung	334.2	0.0005	3.2	12000
BRCA1	kidney	198.7	0.0020	2.1	9500
BRCA1	heart	267.3	0.0010	2.7	11000
TP53	brain	678.9	0.0400	0.3	2045
TP53	liver	589.4	0.0200	0.4	1890
TP53	lung	723.1	0.0300	0.2	2200
TP53	kidney	645.8	0.0100	0.5	1950
TP53	heart	701.2	0.0200	0.3	2100
EGFR	brain	1456.2	0.0001	4.2	15
EGFR	liver	1234.5	0.0003	3.8	12
EGFR	lung	1567.8	0.0002	4.5	18
EGFR	kidney	1345.6	0.0001	4.0	14
EGFR	heart	1489.2	0.0004	4.3	16