Negativicutes example

= Mount shared folder, VM = sudo mkdir /media/BiotoolsShared-virtual sudo mount -t vboxsf myShare /media/BiotoolsShared-virtual foreach i (*.fsa ) echo $i saco_convert -I Fasta -O tab $i > $i:r.tab sed -r '/\t\t\t/d' $i:r.tab > $i.temp.tab saco_convert -I tab -O Fasta $i.temp.tab > $i rm *tab end
 * remove zero length entries from FASTA

= Download genomes = for x in CP001859 CP003058 FP929048 HE576794 CP002637 CP001820 ACGB00000000 AFHQ00000000 ACIM00000000 AFBB00000000 AENT00000000 ADGP00000000 AECS00000000 AFUG00000000 AFIJ00000000 ABWK00000000 AEVN00000000 AECV00000000 ACLA00000000 ACKT00000000 AENV00000000 AEEJ00000000 ACKP00000000 AAWL00000000 AEDR00000000 AEDS00000000 ACIK00000000 ADFU00000000 ADCV00000000 ADCW00000000 AENU00000000 AFUJ00000000 do getgbk -a $x > $x.gbk done for x in *gbk do extractname $x done for x in CP001859 CP003058 FP929048 HE576794 CP002637 CP001820 ACGB00000000 AFHQ00000000 ACIM00000000 AFBB00000000 AENT00000000 ADGP00000000 AECS00000000 AFUG00000000 AFIJ00000000 ABWK00000000 AEVN00000000 AECV00000000 ACLA00000000 ACKT00000000 AENV00000000 AEEJ00000000 ACKP00000000 AAWL00000000 AEDR00000000 AEDS00000000 ACIK00000000 ADFU00000000 ADCV00000000 ADCW00000000 AENU00000000 AFUJ00000000 do rm $x.gbk done
 * Download genomes from ACC or WGS numbers
 * Rename files after organism name in GenBank file and ACC or WGS number
 * Procedure copies files
 * Remove files named only as ACC or WGS numbers.
 * Procedure leaves files with organism names

= Extract DNA from GenBank = for x in *gbk do saco_convert -I genbank -O fasta $x > $x.fna done for x in *gbk.fna do newx=`echo $x | sed "s/.gbk.fna/.fna/"` mv $x $newx done
 * Extract DNA from Origin field of GenBank file
 * Rename files from *gbk.fna to *.fna

= 16S rRNA analysis = for x in *fna do echo $x rnammer -S bac -m ssu -f $x.rrna $x done grep -c ">" *rrna | awk 'BEGIN{FS=":"}{sum += $2}END { print sum}' for x in *fna.rrna do newx=`echo $x | sed "s/.fna.rrna/.rrna/"` mv $x $newx done select16SrRNA -out all.fna
 * Search DNA string for patterns similar to rRNA models in RNAmmer
 * Count total number of sequences in a set of rrna files:
 * Rename files from *fna.rrna to *.rrna
 * Extract one 16S rRNA sequence from each genome
 * Goes trough each file in the directory called *rrna

Selecting the sequence with highest score and length between 1400 and 2000 Default is 1400 - 2000 base pairs, change using -min and -max

Program is writing to file called all.fna Default output file is selected16SrRNA.fna, change using -out

Number of files to be evaluated (extension *.rrna): 32 Number of files with no sequences: 2

Score: 1920.1 :: Length 1545 :: Acidaminococcus_sp_D21_ID_ACGB00000000.rrna Score: 1836.1 :: Length 1557 :: Dialister_invisus_DSM_15470_ID_ACIM00000000.rrna Score: 1878.8 :: Length 1555 :: Dialister_micraerophilus_DSM_19965_ID_AFBB00000000.rrna Unacceptable length: 1325 :: /molecule=16s_rRNA /score=1197.2 :: Dialister_microaerophilus_UPII_345-E_ID_AENT00000000.rrna ............ select16SrRNA -out all.fna | grep Unacc | wc = 6 select16SrRNA -out all.fna -min 1100 | grep -c Unacc = 0 clustalw all.fna clustalw all.fna -bootstrap=1000 njplot all.phb Postscript file was made pretty manually in texteditor
 * Using default setting, 6 sequence do not have the right length (1400-1800)
 * Using a smaller minimum length, all 29 genomes have an acceptable 16S rRNA sequence
 * Performing a multiple alignment of the selected sequences:
 * Save file as post script

= Genome atlas =

mkdir GenomeAtlas Vparvula sed s/XXXX/Veillonella_parvula_DSM_2008_ID_CP001820/g /usr/biotools/genomeAtlas > Vparvula.genomeatlas.sh chmod +x Vparvula.genomeatlas.sh ./Vparvula.genomeatlas.sh
 * Atlases were created for complete genomes= *For each atlas a folder was created to contain the files for that atlas
 * The files are saved but can be deleted as soon as the atlas has been drawn

= Identify open readingframes in DNA using Prodigal = for x in *fna do prodigalrunner $x done

= Bias in third codon position and Amino acid and codon usage =
 * Calculate AT and amino acid usage in open reading frame files, FASTA format

for i in *orf.fna do perl /usr/biotools/indirect/atStats.pl $i > $i.atStats.tab cat $i.atStats.tab > $i.CodonAaUsage perl /usr/biotools/indirect/CodonAaUsage.pl $i >> $i.CodonAaUsage rm $i.atStats.tab done grep aa *AaUsage > aaUsage.all sed -i s/_prodigal.orf.fna.CodonAaUsage:aa//g aaUsage.all grep Total *AaUsage > statistics.all sed -i 's/_prodigal.orf.fna.CodonAaUsage:/\t/g' statistics.all cut -f2,3,4,5,6,7,8 statistics.all > tmp.all mv tmp.all statistics.all sed -i 's/_prodigal.orf.fna//g' statistics.all grep codon *AaUsage > codonUsage.all sed -i s/_prodigal.orf.fna.CodonAaUsage:codon//g codonUsage.all

= Drawing heatmaps of Bias in third codon position and Amino acid and codon usage = install.packages("gplots") library(gplots) codon <- read.table("codonUsage.all") colnames(codon) <- c( 'Name', 'codon', 'score', 'count') codon <- codon[1:3] test <- reshape(codon, idvar="Name", timevar="codon", direction="wide") codonMatrix <- data.matrix(test[2:length(test)]) rownames(codonMatrix) <- test$Name codon_heatmap <- heatmap.2(codonMatrix, scale="none", main="Codon usage", xlab="Codon fraction", ylab="Organism", trace="none", margins=c(8, 25)) dev.print(postscript, "codonUsage.ps", width = 25, height=25) dev.off

library(gplots) aa <- read.table("aaUsage.all") colnames(aa) <- c( 'Name', 'aa', 'score') test <- reshape(aa, idvar="Name", timevar="aa", direction="wide") aaMatrix <- data.matrix(test[2:length(test)]) rownames(aaMatrix) <- test$Name stat_heatmap <- heatmap.2(aaMatrix, scale="none", main="Amino acid usage", xlab="Amino acid fraction", ylab="Organism", trace="none", margins=c(8, 25), col = cm.colors(256)) dev.print(postscript, "aaUsage.ps", width = 25, height=25) dev.off

= Bar and rose plots for amino acid, codon usage and bias in third position =

for x in *orf.fna do basicgenomeanalysis $x /usr/bin/gnuplot done

= BLAST martix = makebmdest. > bmdest.xml blastmatrix -cpu 5 bmdest.xml > blastmatrix.ps
 * Input file is created in directory with protein files in FASTA format
 * Takes all fsa files in directory

= Core and pan genome analysis, single linkage clustering = ls -1 *orf.fsa | gawk '{print $1 "\t" $1}' > pancore.list pancoreplot pancore.list
 * Input file is created in directory with protein files in FASTA format
 * The output is a post script file and a table written to the screen =

= Genome statistics = for x in *orf.fna do genomeStatistics $x >> all.stats done