%%script false --no-raise-error

# using help()
import pandas as pd # this is how Pandas is commonly imported
help(pd.DataFrame) # DataFrame is a class provided by the Pandas module; help() prints its documentation

# using ?
pd.Series?

# import modules
import pandas as pd # enables manipulation of dataframes in Python
import numpy as np # enables manipulation of arrays in Python
import random # fetches random values
from urllib.request import urlretrieve # enables the downloading of files from the internet via URL
import gzip # module to enable I/O of gzipped files
import shutil # module to perform operations on files
from Bio import SeqIO # import the SeqIO Biopython module for working with sequence data
import matplotlib.pyplot as plt # import the matplotlib graphics module
import seaborn as sns # import the seaborn graphics module
import warnings # module to manage warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # prevent Python from warning of future feature deprecation

# string
test_string = "test string"
string_message = 'The content of variable test_string is the string ' + test_string
print(string_message)
print(type(test_string))

# integer
test_int = 3
# a number cannot be concatenated onto a string directly, so it is first converted to text with str()
int_as_text = str(test_int)
print('The content of variable test_int is the string ' + int_as_text)
print(test_int)
print(type(test_int))

# float
test_float = 3.14159
float_as_text = str(test_float)
print("The content of variable test_float is the string " + float_as_text)
# single and double quotes are interchangeable when writing string literals in Python
# a bare expression on the last line of a notebook cell is displayed as the cell's result
type(test_float)

The content of variable test_string is the string test string
<class 'str'>
The content of variable test_int is the string 3
3
<class 'int'>
The content of variable test_float is the string 3.14159

float

# list
# a list is written with square brackets and keeps its elements in the order given
test_list = ["test", "list", "example"]
print(test_list)
# type() reports the class of the object held by the variable
print(type(test_list))

['test', 'list', 'example']
<class 'list'>

# tuple
# a tuple is written with parentheses
test_tuple = ("test", "tuple", "example")
print(test_tuple)
print(type(test_tuple))
# indexing example: positions are counted from 0, so index 1 is the second element
second_word = test_tuple[1]
print(f'The second element of the tuple is the word "{second_word}".')

('test', 'tuple', 'example')
<class 'tuple'>
The second element of the tuple is the word "tuple".

# set
# a set is written with curly braces and holds each distinct value only once
test_set = {"test", "set", "example"}
print(test_set)
# note how this was likely printed to your screen in a different order from how the objects were originally given to the set
# (sets are unordered, so the print order is not guaranteed)
print(type(test_set))

{'example', 'set', 'test'}
<class 'set'>

# dictionary
# a dictionary maps keys to values and is written as {KEY: VALUE, ...}
test_dict = {"key1": "value1", "key2": "value2", "key3": "value3"}
print(test_dict)
print(type(test_dict))

# look up a single value with DICTIONARY_NAME[KEY]
key2_value = test_dict["key2"]
print(f'The value associated with key2 is "{key2_value}".')

{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
<class 'dict'>
The value associated with key2 is "value2".

k = random.randint(-10, 10) # draw a random integer from -10 to 10 (both ends included)

if k == 0:
    # primary condition: k is exactly zero
    print("The value of k is 0.")
elif k > 0:
    # secondary condition, only evaluated when the primary one is False
    print("The value of k is positive.")
else:
    # neither condition above was fulfilled, so k must be <0
    print("The value of k is negative.")

The value of k is negative.

# while loop

# start the iterator j at 0
j = 0

# keep looping for as long as the condition holds; prints 0 through 7
while j < 8:
    print(j)
    # increase j by 1 on every pass, otherwise the loop would never end
    j = j + 1
    # once j reaches 8 the condition is False and the loop stops

# for loop

# looping over a list

# accession numbers for a few RAS proteins
# source: https://www.ebi.ac.uk/interpro/entry/pfam/PF00071/protein/UniProt/#table
ras_list = [
    "A0A010Q0G0",
    "A0A010Q6W4",
    "A0A010Q7K6",
    "A0A010QKF9",
    "A0A010QSB8",
]

for accession in ras_list:
    # visit each element of ras_list in turn
    if accession[-1:] != "8":
        # the last character is not 8, so move on to the next accession
        continue
    # only accessions ending with the number 8 reach this line
    print("The RAS protein accesion number " + accession + " ends with the number 8.")


# looping over a dictionary

# bacterial species (keys) and the phylum each belongs to (values)
bacteria_dict = {
    "Gardnerella vaginalis": "Actinobacteria",
    "Lactobacillus crispatus": "Firmicutes",
    "Lactobacillus iners": "Firmicutes",
}

# find which species are Firmicutes
for species, phylum in bacteria_dict.items():
    # .items() hands back each key together with its value,
    # so the value can be tested directly without looking it up again
    if phylum == "Firmicutes":
        # report the species whose phylum is Firmicutes
        print(species + " is a Firmicute (Bacillota).")

The RAS protein accesion number A0A010QSB8 ends with the number 8.
Lactobacillus crispatus is a Firmicute (Bacillota).
Lactobacillus iners is a Firmicute (Bacillota).

# save the file name to a variable
# (a relative path, so the file is looked up in the current working directory)
iris_db = "iris.csv"

# import the iris dataset into a Pandas dataframe
# sep=',' declares the file as comma-separated; header=0 takes the column names from the first row
iris_df = pd.read_csv(iris_db, sep=',', header=0)

# list the column names that were read from the header row
print(iris_df.columns)

# print the dataframe
# (a bare expression on the last line of a notebook cell is displayed as the cell's result)
iris_df

Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Class'], dtype='object')

# keep only the rows whose Sepal_Length is at least 5
# the inner comparison builds a True/False value per row, and .loc selects the rows marked True
iris_df.loc[iris_df['Sepal_Length'] >= 5]
# note that this doesn't save the filtered dataframe, only displays it

# get the log10 values of sepal length
print(np.log10(iris_df['Sepal_Length']))
# Pandas has inbuilt methods for multiplication .mul(), division .div(), addition .add() & subtraction .sub()
# but for more complicated math, numpy can help
# here .add(3) adds 3 to every value in the column; the result is printed, not saved
print(iris_df['Sepal_Length'].add(3))

0      0.707570
1      0.690196
2      0.672098
3      0.662758
4      0.698970
         ...   
145    0.826075
146    0.799341
147    0.812913
148    0.792392
149    0.770852
Name: Sepal_Length, Length: 150, dtype: float64
0      8.1
1      7.9
2      7.7
3      7.6
4      8.0
      ... 
145    9.7
146    9.3
147    9.5
148    9.2
149    8.9
Name: Sepal_Length, Length: 150, dtype: float64

# manipulate text in a column
# remove the Iris- substring from the Class column and save the edit to the dataframe
# regex=False makes Pandas treat 'Iris-' as plain text; the default for this argument
# differs between Pandas versions, so stating it keeps the result version-independent
iris_df['Class'] = iris_df['Class'].str.replace('Iris-', '', regex=False)
# display the edited dataframe
iris_df

# divide one column by another
# Pandas applies the / operator element-wise, row by row, so no extra module is needed here
iris_df['Petal_WdivD'] = iris_df['Petal_Length']/iris_df['Petal_Width']
# round the new column to 3 decimal places
iris_df['Petal_WdivD'] = iris_df['Petal_WdivD'].round(3)

# now see the new dataframe
iris_df

# using groupby

# group the data according to the type of iris and get the means of each type of data for that iris type
# the result has one row per Class value and is displayed, not saved
iris_df.groupby(['Class']).mean()

# lambda function

# build a small Pandas dataframe in which one of the columns holds Python lists
d = {
    'sequence_ids': ["seq_1", "seq_2"],
    'sequence_start': ["ACTGTGTG", "ATGTGTG"],
    'functional_profile': [
        ["protein folding", "chaperone"],
        ["immune", "autoimmune", "white blood cells"],
    ],
}
df = pd.DataFrame(data=d)
print(df)

# a lambda is a small unnamed function; .apply() calls it once for every cell of the column
# here it turns each list into a single comma-separated string
df['functional_profile'] = df['functional_profile'].apply(
    lambda terms: ', '.join(str(term) for term in terms)
)
print(df)

  sequence_ids sequence_start                       functional_profile
0        seq_1       ACTGTGTG             [protein folding, chaperone]
1        seq_2        ATGTGTG  [immune, autoimmune, white blood cells]
  sequence_ids sequence_start                     functional_profile
0        seq_1       ACTGTGTG             protein folding, chaperone
1        seq_2        ATGTGTG  immune, autoimmune, white blood cells

# for more detailed information on downloading files from the internet with Python, see: 
# https://realpython.com/python-download-file-from-url/

# set necessary variables
# url: remote location of the gzipped FASTA file; filename: the local name it is saved under
# NOTE(review): the URL points at orf_genomic_all (under orf_dna/) while the local name says orf_trans;
# the name is left as-is because the following cells refer to this variable
url = ("http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_dna/orf_genomic_all.fasta.gz")
filename = "orf_trans.fasta.gz"

# download file
# urlretrieve saves the content found at url into the local file named by filename
urlretrieve(url, filename)

('orf_trans.fasta.gz', <http.client.HTTPMessage at 0x7f79141ea950>)

# open the FASTA file & extract the data

# the gzipped download is first decompressed to a plain-text FASTA file with the gzip module
# ref: https://github.com/biopython/biopython/issues/1686
# ref: https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python

# name the output file after the input file, minus the .gz extension
new_file = filename.removesuffix('.gz')

# open the gzipped file for reading and the new file for writing (both in text mode),
# then copy the contents across
with gzip.open(filename, 'rt') as f_in, open(new_file, 'wt') as f_out:
    shutil.copyfileobj(f_in, f_out)

# now create a dictionary with the record ID as key and the sequence as value
# ref: https://stackoverflow.com/questions/29333077/reading-a-fasta-file-format-into-python-dictionary
yeast_seq_dict = {}
for rec in SeqIO.parse(new_file, "fasta"):
    yeast_seq_dict[rec.id] = rec.seq

# print the first key and value from the dictionary: a header & sequence
# turning the dictionary into a list gives its keys, so [0] is the first key
yeast_key_1 = list(yeast_seq_dict)[0]
print(yeast_key_1)
print(yeast_seq_dict[yeast_key_1])

YAL069W
ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAA

%%script false --no-raise-error

'''
For the sake of introducing Biopython, we created a sequence dictionary using the Bio.SeqIO module in the code above. 

However, it's worth noting that you can create this type of dictionary on your own, as well. 
In this code block, which will not execute by default, I demonstrate how that can be done. 
'''

# create an empty dictionary to fill with sequence information
seq_dict = {}

# iterate over the gzipped file's contents to extract information
with gzip.open(filename, "rt") as handle:
    # open the .gzip file for reading (text mode)
    # header holds the identifier of the record currently being read;
    # None means that no header line has been seen yet
    header = None
    # seq_list collects the sequence lines belonging to the current header
    seq_list = []
    for line in handle:
        # iterate through the file line by line
        if line.startswith(">"):
            # a new header line marks the end of the previous record
            if header is not None:
                # store the previous record, joining its lines into a single sequence
                # testing the header (not the list) also keeps records whose sequence is empty
                seq_dict[header] = ''.join(seq_list)
            # remove the ">" character with [1:] and the end-line character with .strip()
            header_full = line.strip()[1:]
            # save only the sequence identifier (the text before the first whitespace) as the new header
            header = header_full.split()[0]
            # start collecting the sequence lines of the new record
            seq_list = []
        elif header is not None:
            # a sequence line; lines appearing before the first header are skipped
            seq_list.append(line.strip())
    if header is not None:
        # the last record has no following header line to trigger its storage,
        # so it is added to the dictionary after the loop
        seq_dict[header] = ''.join(seq_list)

# reference for working with multi-line fastas
# https://stackoverflow.com/questions/50856538/how-to-convert-multiline-fasta-files-to-singleline-fasta-files-without-biopython

# now print the first key:value pair
key_1 = list(seq_dict.keys())[0]
print(key_1)
print(seq_dict[key_1])

# reverse complements

with open(new_file, "r") as infile:
    # open the fasta file for reading
    # enumerate() numbers the records from 1 so that the printing can be limited
    for counter, record in enumerate(SeqIO.parse(infile, "fasta"), start=1):
        # for each of the records in the fasta file
        if counter > 10:
            # only the first 10 records are printed, so stop reading the file here
            break
        # .reverse_complement() complements every base AND reverses the sequence;
        # .complement() alone would return the complementary strand without reversing it
        print( "For " + record.id + " the reverse complement sequence is '" + str(record.seq.reverse_complement()) + "'." )

For YAL069W the reverse complement sequence is 'TACTAGCATTTATTGTGTGTGCACGAATGGGATGGTGAAATATGGTGGTGGTGTACGGTATGAGTGGGAGTGAACATATGACTAAAATGCATGCGTGTGCCTACGATGTCATATATGGTAGAGTTTGAATGGGATGAGAGTCTAAGGTGAAGTGAGGTACCGGGTAGAGAGTGACTTAGTCATGGTTTACGTGAGTGTAGTAATACGTGCCGTGAACGGAGTCGCCAGATATGGGACACGGTAAATGGGTATTGCGGGTAGTAATAGGTGTAAAACTATAGATATAGAGTAAGCCGCCAGGGTTTATAACATATT'.
For YAL068W-A the reverse complement sequence is 'TACGTGCCGTGAACGGAGTCGCCAGATATGGGACACGGTAAATGGGTATTGCGGGTAGTAATAGGTGTAAAACTATAGATATAGAGTAAGCCGCCAGGGTTTATAACATATTGACGGGAATTATGTATGCAATATGGTGAAAACGTGGTATATGAATGGTGAGGTAAATATATGTGAATACAGTTATAATGTCTTTTTAGGGGTGTTTTTAGTGGATTTGTATTTTTATAAGATGAAAAGTTGTTATTATGTATT'.
For YAL068C the reverse complement sequence is 'TACCAGTTTAATTGAAGTTAGCGGCGACCACAGCGACGGTAGCGACGATGACGAAGACGTTGGTGGTGAGATCGAGTTAGACTGCTTTCTCAGTTGAACCACCTTAACCCACAGATGCAGAGACTATAGTCTCGAGTGAATCGGGTTATGATGTACAAGGTTCGGCGGGTGGGTTGACTTTGGATGGGTCAGCTTCAACGACTTCGGCAAAAGTTGATGCCACTGAAGTGGTGGTACAACTGGCCATAACGAGGTCTGGTTCACTGGTCTTACTAGTGGCCACAAGGTACCATGAGGTCGTCTAATTTCGGTCGGTAGAGGTCACGAGATAGGTTCCTGCCATAGATGTGATAGCGTTTGATC'.
For YAL067W-A the reverse complement sequence is 'TACGGTTAATATCCCCACGGCTCCACGGAATATTTTGGGAAAAGACACGGACACTGTAAAGGAAAAAGCCAGTTTTTCTTATAGGCTTAAAATCTAAACCTGGGAGCATGTCTTCGAATAACAGATTCGGACTTAAGTCAGACGAAATTTGCCGAAGGCGCCTCCTTTATAAAGGTAGAGAACTTAAGCATGTTGTAATTTGCACACAACCCTCAGCATATGACAATC'.
For YAL067C the reverse complement sequence is 'TACATAAGTTAACAATTTCTCTAATAACATCTAGGAATGTTTTCTGATTTTACCCCAAAATAAGGTCATTTCGCCGTCCACCTTCTGGACGGTCTACTGAATTTAAGTTGTCTTTAACAGTGATAGAGGTTGTCATAGGTCTCAGTACTTTGTCGACTTTTAAAGTAGTGCTGATGTTCACTTTTTCTAGTTGATGTAAAACTCTGATCATCGATATCACTTGTATTTCTGTTACACTTGCAATGATCTTCAATACTTATATCTCTACTTCGGCTATCCGGTACCACCTCTAAAAAGCTACTTGTTCTCATAGCCTAGTTACTTTTCCTTTCTAGAGTGTTATTTACCATATCAACCAAGTTTGTCCCATGGAGAAAGTTTCTTTTTTTTAATAATTAATTTAACCTACAGGAAAATCGGAAAATAAGAACATAACGAATAACCCACTTTATAGACCTATGCCAATTATATTTGTTGCGAATGCAAAGCCCTTACTTCCTTCTAAATCCGAAAGTTCCATTACTAAACCACGTATGAGTTCATTACATGTGTCAACCATTATAATATAAAGTTAACGGTAAAAACTAAATGGACTTGTTCGAGGGTAATTTGATACAAAATGGTTCGGAGCTGAATACAACCAGCGAAAATTGGCAACCACGACGTATACAGTTAAGACATGGTGTGAACTTTCGTTAATCCAAAAAGTAACCCCGAAAACTTCGCGGTTCAATAAACCGTATGGTTATAAACAAACCAAGGAAAATGTTTGTACTACTTTACCACGCAGCAAGACGAAAAATGATAAACCCGGTCATATAGCCATAAGATAGGCGACCACCCTAGGTCAGTCGGCATATAAGTAGCAATTTACCACATTTACCAAATCTCCCTACCTCTACCTTGAAATAATAACTGCGATAACACAGACAGCATCACCCGGAATAACCTAAAATGAGGGACGGTCCACTGGGTATGTTGACAATAAGATAAAAGAATTGACTACTACTTTAATCCAACCGCTCCTTTTCTAATTTTCTTTTGGTTTGTCCATTTTCACTAAAACTTTGTTTTCATAAGCTATAATTTGACACCTTTTGTTAAAAGTCACTAACCTTTTATATGTAAAATTGAAATACCTTATAAAAGACAACCTTACTGTCATTACAAAGTAGACCCCGTATGGATGATACCAACTTTAGAAACTTTTCTATGAGATAAGGATTCGAGTTAGTCAATAGGTACTAATGAGGCCCAAATCCATACCAAATAAACTACGAATGACCATAATAACGTCTATTTAATGTGAGAGCAACCAAACGCTAATAAAAATGAGTCCAAAAGTTATAGTAACCATTGAGGTATAATCGGCGAACCCTGCAGCGTCTTCCTCGGTTTACCAAACGTAAATACGACGTTACAAAACCAACCCGATACCGAGGACAAAATATGAGAACCGTTTTGCTATAAACAGCGGCTCTACGAGTTTGATCTCGATAATGAAATCAATGTTACTTATAATACCGAGTTAGTAGATGGCGTACCTATTCACAAAACCAAACCTTTTGTCTTCTTCGAGGGTCCATAAATTTCCCCAAATGAAAGTGACGTACAAGACGAAAAACAGAGAGGTAAACCTGAAAACAACATGAGATGAAGATATTTGCACTACTTTCCTTGTTACGGTTCTTCTTGCCATAACACGAAATATTGAGATTTGTACCACACCTTTTCGGTTGCAGTTTTCTGCAACTTTGGAATAGTCATAGACTACTTTTTATT'.
For YAL066W the reverse complement sequence is 'TACAATAGAGAACATTTTTCTTCATAAGAAGTAAGTTATGGTTAATGAGCAGTGTAAGAAGGTTAGGTTAATTATAACCAATTTTACTTGGTACACGTTTAGTCTTTGTATTTTAATATAGTGAAATAAAGTATACCAAAGTACGAATGTTTCGAATGACAGAAAGAGAAATTGAATAAAAAGATGTCCGATGCTTAAGAAACGTCCGAATGAAATGAGTATAATAGTAATGGACATGTTTATATATAATTTCTTTAGGTTTGTTTTTACGAACTTTTCGTATGTCGAAGGCTATGTAGTACATATATC'.
For YAL065C the reverse complement sequence is 'TACTTGTCACGATGGTCACTCTGTTGTTGGTTATGACCTCGACGACTCTGTTGATGGTCATGACCTCGACGACTCTGCTTTTGTCATCAGTGGAGAAGTTAAAGTTCTAAGTTAGTACGACTTTGTGTCTGCCGAAGGCGCTGGCTACACTAACCAGTGTCGTCATCACAACAAAGACATAGGCTTTGACCGTTGTGGTTCTCAGATTATTGAAGGCCCAATTCATGATACAGCGTTGTCGGAGCATCGTGTGGTCGTTCATCGTATTATCCTAGATCATGACGGAGAAATCTTTAGAGTTGGATGCAACCATAACGGTTACCAGACAACTGGTTATTACCGTATTCACAAAAATAAAGGTGGCATAACGACCGTTAGCATACCATT'.
For YAL064W-B the reverse complement sequence is 'TACCGTCCACTTCGTCAAAGCCTTGTGTGTGGTCTAAGCGTCCTTCATTGTCATTGATCGCATCAAACAACGGAGCTAAGACACCACCTTTATCCTGTGGTACAGCACATAAGACACCATTGCGGCAATTATCATCGTCACGAATATCTATGTTACCGACCTCTTCGTCAGAATCTCGTATGCAGTCTAAGTGTTCTTCTTTAACATTGATGACACCAAACGAGACAACAGGGGAACCAAACGAAACAACAGAGGCATCAAACGAAACAATAGAGACACCATCTTTATCCCGTGGTACACCATATGAGACAACACCGTGGCGATTGTCATTGCCACCGGCACCTTTGTTAACGTCTCCTCTACCTAAGTCACGTGTGTACT'.
For YAL064C-A the reverse complement sequence is 'TACTGTCCAATGAAAAATGGTGGTGTTTGTTCAAGAATGTGCAAGTCCAAACGATTCCAGCTACTGAGACGTTAAGATAGTCAGCCACCGCTGCAACGTAAGCCTACGACACGTGTTCTCGTTGGAGGTTAATGTAGGTGTTTGAAATGCTAATTACCATAGTTCGGTACCGTTCCTTCTAACGGGCTATTGTAACGTCCCTGACACATATACATACGACCTAAGATGACAGGTTACTTCTAACAGATGAGTTTGCGACAAAGAACCGTATGTGACGGTCAATCGCACCTTAACGGACTACAATGATGACAATCGCTACTAAAACGTCCCGTGCAAATGAGAAAACTGCTACTGGATTGTCGGGTTAACATGATAGGGACT'.
For YAL064W the reverse complement sequence is 'TACTTAGGAAAACGTAGCGATCTTCCTGTTCTATTATAAAGAAGACAAAAAAAAAATGTATACGTTGTTAAACTTTCAGTCCAGTTTCTGTCTAAGGCCAAGGGGTATAAGTCTAACCTCTCTTTTTGAAAACCGTTGAGTACAATGGTTCAGCTCTGCGAATTTCAATTCACAGCCGGTTCTGTGCGTTTTAGAACATTAGAAAATTGCGACGAGAAGTTTAGTGCCTGCGTTAGTCATGAACATGGATTAAAACCAAAAGATTATAACTTATCGCTTGGTATC'.

# translate the ORF sequences

with open(new_file, "r") as infile:
    # open the fasta file for reading
    # enumerate() numbers the records from 1 so that the printing can be limited
    for counter, record in enumerate(SeqIO.parse(infile, "fasta"), start=1):
        # for each of the records in the fasta file
        if counter > 10:
            # only the first 10 records are printed, so stop reading the file here
            break
        # .translate() converts the nucleotide sequence into its protein sequence
        print( "For " + record.id + " the translated protein sequence is '" + str(record.seq.translate()) + "'." )

For YAL069W the reverse complement sequence is 'MIVNNTHVLTLPLYTTTTCHTHPHLYTDFTYAHGCYSIYHLKLTLLSDSTSLHGPSLTESVPNALTSLCTALASAVYTLCHLPITPIIIHILISISHSAVPNIV*'.
For YAL068W-A the reverse complement sequence is 'MHGTCLSGLYPVPFTHNAHHYPHFDIYISFGGPKYCITALNTYVIPLLHHILTTPFIYTYVNITEKSPQKSPKHKNILLFNNNT*'.
For YAL068C the reverse complement sequence is 'MVKLTSIAAGVAAIAATASATTTLAQSDERVNLVELGVYVSDIRAHLAQYYMFQAAHPTETYPVEVAEAVFNYGDFTTMLTGIAPDQVTRMITGVPWYSSRLKPAISSALSKDGIYTIAN*'.
For YAL067W-A the reverse complement sequence is 'MPIIGVPRCLIKPFSVPVTFPFSVKKNIRILDLDPRTEAYCLSLNSVCFKRLPRRKYFHLLNSYNIKRVLGVVYC*'.
For YAL067C the reverse complement sequence is 'MYSIVKEIIVDPYKRLKWGFIPVKRQVEDLPDDLNSTEIVTISNSIQSHETAENFITTTSEKDQLHFETSSYSEHKDNVNVTRSYEYRDEADRPWWRFFDEQEYRINEKERSHNKWYSWFKQGTSFKEKKLLIKLDVLLAFYSCIAYWVKYLDTVNINNAYVSGMKEDLGFQGNDLVHTQVMYTVGNIIFQLPFLIYLNKLPLNYVLPSLDLCWSLLTVGAAYVNSVPHLKAIRFFIGAFEAPSYLAYQYLFGSFYKHDEMVRRSAFYYLGQYIGILSAGGIQSAVYSSLNGVNGLEGWRWNFIIDAIVSVVVGLIGFYSLPGDPYNCYSIFLTDDEIRLARKRLKENQTGKSDFETKVFDIKLWKTIFSDWKIYILTLWNIFCWNDSNVSSGAYLLWLKSLKRYSIPKLNQLSMITPGLGMVYLMLTGIIADKLHSRWFAIIFTQVFNIIGNSILAAWDVAEGAKWFAFMLQCFGWAMAPVLYSWQNDICRRDAQTRAITLVTMNIMAQSSTAWISVLVWKTEEAPRYLKGFTFTACSAFCLSIWTFVVLYFYKRDERNNAKKNGIVLYNSKHGVEKPTSKDVETLSVSDEK*'.
For YAL066W the reverse complement sequence is 'MLSLVKRSILHSIPITRHILPIQLILVKMNHVQIRNIKLYHFISYGFMLTKLTVFLFNLFFYRLRILCRLTLLILSLPVQIYIKEIQTKMLEKHTASDTSCI*'.
For YAL065C the reverse complement sequence is 'MNSATSETTTNTGAAETTTSTGAAETKTVVTSSISRFNHAETQTASATDVIGHSSSVVSVSETGNTKSLITSGLSTMSQQPRSTPASSIIGSSTASLEISTYVGIANGLLTNNGISVFISTVLLAIVW*'.
For YAL064W-B the reverse complement sequence is 'MAGEAVSEHTPDSQEVTVTSVVCCLDSVVEIGHHVVYSVVTPLIVAVLIDTMAGEAVLEHTSDSQEEIVTTVVCSVVPLVCFVVSVVCFVISVVEIGHHVVYSVVAPLTVTVAVETIAEEMDSVHT*'.
For YAL064C-A the reverse complement sequence is 'MTGYFLPPQTSSYTFRFAKVDDSAILSVGGDVAFGCCAQEQPPITSTNFTINGIKPWQGRLPDNIAGTVYMYAGFYCPMKIVYSNAVSWHTLPVSVELPDVTTVSDDFAGHVYSFDDDLTAQLYYP*'.
For YAL064W the reverse complement sequence is 'MNPFASLEGQDNISSVFFLHMQQFESQVKDRFRFPIFRLERKTFGNSCYQVETLKVKCRPRHAKSCNLLTLLFKSRTQSVLVPNFGFLILNSEP*'.

# reading in a GFF file

# set necessary variables
# gff_url: remote location of the GFF file; gff_filename: the local name it is saved under
gff_url = ("ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Bacteria/Halobacterium_sp_uid217/AE004437.gff")
gff_filename = "AE004437.gff"

# download file
urlretrieve(gff_url, gff_filename)


# gff files don't contain column names, so we have to manually provide those
col_names = ["sequence_id", "source", "feature", "start", "end", "score", "strand", "phase", "attributes"]

# read the GFF file into a Pandas dataframe
AE004437_df = pd.read_csv(gff_filename, sep = '\t', names = col_names, skiprows = 5, skipfooter = 1, engine = 'python')
# gff files are tab-separated, so need to specify sep='\t' because read_csv will assume sep=',' by default
# the names = col_names argument allows us to set the column names on import
# skiprows = 5 skips the first 5 lines of the file on import
# skipfooter skips the last n lines of the file; the last line of the GFF file contains only hashtags ("#")
# engine = 'python' prevents skipfooter from raising an error

# examine the dataframe
AE004437_df

# use seaborn to generate a histogram of the gene lengths

# first, create a new column gene_length in the dataframe
# GFF start and end coordinates are 1-based and both inclusive,
# so a feature covers end - start + 1 bases (e.g. 248..1453 is 1206 bases)
AE004437_df["gene_length"] = AE004437_df.end - AE004437_df.start + 1
print(AE004437_df)

# now plot the relevant data in a histogram
gene_length_plot = sns.histplot(data = AE004437_df[AE004437_df["feature"] == "gene"], x = "gene_length")
# data = AE004437_df[AE004437_df["feature"] == "gene"] will subset the Pandas dataframe to only the rows where feature == "gene"
# next add labels to the axes & a title
gene_length_plot.set_title('Gene Lengths in AE004437')
gene_length_plot.set_ylabel('Counts')
gene_length_plot.set_xlabel('Gene lengths')


# visualize the plot
gene_length_plot

# write the plot to a file using matplotlib
plt.savefig("AE004437_gene_lengths.svg")

     sequence_id   source feature    start      end score strand phase  \
0     AE004437.1  Genbank  region        1  2014239     .      +     .   
1     AE004437.1  Genbank    gene      248     1453     .      +     .   
2     AE004437.1  Genbank     CDS      248     1453     .      +     0   
3     AE004437.1  Genbank    gene     1450     2115     .      +     .   
4     AE004437.1  Genbank     CDS     1450     2115     .      +     0   
...          ...      ...     ...      ...      ...   ...    ...   ...   
4290  AE004437.1  Genbank     CDS  2007840  2009699     .      +     0   
4291  AE004437.1  Genbank    gene  2009709  2011541     .      -     .   
4292  AE004437.1  Genbank     CDS  2009709  2011541     .      -     0   
4293  AE004437.1  Genbank    gene  2011729  2014239     .      +     .   
4294  AE004437.1  Genbank     CDS  2011729  2014239     .      +     0   

                                             attributes  gene_length  
0     ID=id0;Name=ANONYMOUS;Dbxref=ATCC:700922,taxon...      2014238  
1     ID=gene0;Name=VNG_0001H;gbkey=Gene;locus_tag=V...         1205  
2     ID=cds0;Name=AAG18645.1;Parent=gene0;Note=Vng0...         1205  
3     ID=gene1;Name=yvrO;gbkey=Gene;gene=yvrO;locus_...          665  
4     ID=cds1;Name=AAG18646.1;Parent=gene1;Dbxref=NC...          665  
...                                                 ...          ...  
4290  ID=cds2055;Name=AAG20700.1;Parent=gene2107;Not...         1859  
4291  ID=gene2108;Name=VNG_2678H;gbkey=Gene;locus_ta...         1832  
4292  ID=cds2056;Name=AAG20701.1;Parent=gene2108;Not...         1832  
4293  ID=gene2109;Name=csg;gbkey=Gene;gene=csg;locus...         2510  
4294  ID=cds2057;Name=AAG20702.1;Parent=gene2109;Dbx...         2510  

[4295 rows x 10 columns]

# inspect the attributes recorded for a few genes in the GFF file

# subset the dataframe to the rows where feature == "gene"
gene_rows = AE004437_df[AE004437_df["feature"] == "gene"]
# .attributes selects the "attributes" column; dot access works because the column name has no spaces
# .str exposes string methods, and .split(";") turns each string into a list, splitting at the semicolons
attribute_lists = gene_rows.attributes.str.split(";")
# randomly sample 4 rows; random_state sets the seed so the same rows are drawn every time
sampled_df = attribute_lists.sample(n = 4, random_state = 42)

# check the results
# ref: https://stackoverflow.com/questions/25351968/how-can-i-display-full-non-truncated-dataframe-information-in-html-when-conver
with pd.option_context('display.max_colwidth', None):
    # inside this block the column contents are not truncated based on width
    display(sampled_df)

1112          [ID=gene544, Name=VNG_0686C, gbkey=Gene, locus_tag=VNG_0686C, old_locus_tag=VNG0686C]
4041         [ID=gene1986, Name=VNG_2523H, gbkey=Gene, locus_tag=VNG_2523H, old_locus_tag=VNG2523H]
861           [ID=gene420, Name=VNG_0533H, gbkey=Gene, locus_tag=VNG_0533H, old_locus_tag=VNG0533H]
1078    [ID=gene527, Name=trp4, gbkey=Gene, gene=trp4, locus_tag=VNG_0667G, old_locus_tag=VNG0667G]
Name: attributes, dtype: object

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Class
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	Iris-virginica
146	6.3	2.5	5.0	1.9	Iris-virginica
147	6.5	3.0	5.2	2.0	Iris-virginica
148	6.2	3.4	5.4	2.3	Iris-virginica
149	5.9	3.0	5.1	1.8	Iris-virginica

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Class
0	5.1	3.5	1.4	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa
5	5.4	3.9	1.7	0.4	Iris-setosa
7	5.0	3.4	1.5	0.2	Iris-setosa
10	5.4	3.7	1.5	0.2	Iris-setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	Iris-virginica
146	6.3	2.5	5.0	1.9	Iris-virginica
147	6.5	3.0	5.2	2.0	Iris-virginica
148	6.2	3.4	5.4	2.3	Iris-virginica
149	5.9	3.0	5.1	1.8	Iris-virginica

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Class
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Class	Petal_WdivD
0	5.1	3.5	1.4	0.2	setosa	7.000
1	4.9	3.0	1.4	0.2	setosa	7.000
2	4.7	3.2	1.3	0.2	setosa	6.500
3	4.6	3.1	1.5	0.2	setosa	7.500
4	5.0	3.6	1.4	0.2	setosa	7.000
...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica	2.261
146	6.3	2.5	5.0	1.9	virginica	2.632
147	6.5	3.0	5.2	2.0	virginica	2.600
148	6.2	3.4	5.4	2.3	virginica	2.348
149	5.9	3.0	5.1	1.8	virginica	2.833

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Petal_WdivD
Class
setosa	5.006	3.418	1.464	0.244	7.07802
versicolor	5.936	2.770	4.260	1.326	3.24290
virginica	6.588	2.974	5.552	2.026	2.78060

BBT045: Intro to Python programming for data analysis¶

Intro¶

Getting help¶

Prep¶

Attributes & Data Types¶

Working with Python data structures¶

Lists¶

Tuples¶

Set¶

Dictionary¶

Iterating over data structures¶

`if` statements¶

`for` loops and/vs. `while` loops¶

Working with tabular data: Pandas¶

Filtering rows based on values¶

Selecting & working on columns¶

Working on groups of values¶

Lambda functions, etc.¶

Application: Reading FASTA files¶

Application: Working with GFF files¶

Quick counts and visualizations¶

Inspect attributes¶

	sequence_id	source	feature	start	end	score	strand	phase	attributes
0	AE004437.1	Genbank	region	1	2014239	.	+	.	ID=id0;Name=ANONYMOUS;Dbxref=ATCC:700922,taxon...
1	AE004437.1	Genbank	gene	248	1453	.	+	.	ID=gene0;Name=VNG_0001H;gbkey=Gene;locus_tag=V...
2	AE004437.1	Genbank	CDS	248	1453	.	+	0	ID=cds0;Name=AAG18645.1;Parent=gene0;Note=Vng0...
3	AE004437.1	Genbank	gene	1450	2115	.	+	.	ID=gene1;Name=yvrO;gbkey=Gene;gene=yvrO;locus_...
4	AE004437.1	Genbank	CDS	1450	2115	.	+	0	ID=cds1;Name=AAG18646.1;Parent=gene1;Dbxref=NC...
...	...	...	...	...	...	...	...	...	...
4290	AE004437.1	Genbank	CDS	2007840	2009699	.	+	0	ID=cds2055;Name=AAG20700.1;Parent=gene2107;Not...
4291	AE004437.1	Genbank	gene	2009709	2011541	.	-	.	ID=gene2108;Name=VNG_2678H;gbkey=Gene;locus_ta...
4292	AE004437.1	Genbank	CDS	2009709	2011541	.	-	0	ID=cds2056;Name=AAG20701.1;Parent=gene2108;Not...
4293	AE004437.1	Genbank	gene	2011729	2014239	.	+	.	ID=gene2109;Name=csg;gbkey=Gene;gene=csg;locus...
4294	AE004437.1	Genbank	CDS	2011729	2014239	.	+	0	ID=cds2057;Name=AAG20702.1;Parent=gene2109;Dbx...

BBT045: Intro to Python programming for data analysis¶

Intro¶

Getting help¶

Prep¶

Attributes & Data Types¶

Working with Python data structures¶

Lists¶

Tuples¶

Set¶

Dictionary¶

Iterating over data structures¶

if statements¶

for loops and/vs. while loops¶

Working with tabular data: Pandas¶

Filtering rows based on values¶

Selecting & working on columns¶

Working on groups of values¶

Lambda functions, etc.¶

Application: Reading FASTA files¶

Application: Working with GFF files¶

Quick counts and visualizations¶

Inspect attributes¶

`if` statements¶

`for` loops and/vs. `while` loops¶