Collect data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#/work/GIF/remkv6/USDA/17_OrthofinderTreeSecretome/01_GFFandProteins

#Collect protein fastas and gffs for each plant parasitic nematode

#From wormbase
bursaphelenchus_xylophilus.PRJEA64437.WBPS10.protein.fa
ditylenchus_destructor.PRJNA312427.WBPS10.protein.fa
globodera_pallida.PRJEB123.WBPS10.protein.fa
globodera_rostochiensis.PRJEB13504.WBPS10.protein.fa
meloidogyne_floridensis.PRJEB6016.WBPS10.protein.fa
meloidogyne_hapla.PRJNA29083.WBPS10.protein.fa
meloidogyne_incognita.PRJEA28837.WBPS10.protein.fa
bursaphelenchus_xylophilus.PRJEA64437.WBPS11.annotations.gff3
meloidogyne_javanica.PRJEB8714.WBPS11.annotations.gff3
meloidogyne_javanica.PRJEB8714.WBPS11.protein.fa
meloidogyne_incognita.PRJEB8714.WBPS11.annotations.gff3
meloidogyne_hapla.PRJNA29083.WBPS11.annotations.gff3
meloidogyne_floridensis.PRJEB6016.WBPS11.annotations.gff3
meloidogyne_arenaria.PRJEB8714.WBPS11.annotations.gff3
meloidogyne_arenaria.PRJEB8714.WBPS11.protein.fa
globodera_rostochiensis.PRJEB13504.WBPS11.annotations.gff3
globodera_pallida.PRJEB123.WBPS11.annotations.gff3
ditylenchus_destructor.PRJNA312427.WBPS11.annotations.gff3

#This is one I created from G. ellingtonae genome using RNAseq with braker
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/26_GloboderaSynteny/otherGloboderaGenomes/ellingtonae/braker/GCA_001723225.1_ASM172322v1_genomic/augustus.gff3 G.ellingtonae.gff3
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/26_GloboderaSynteny/otherGloboderaGenomes/ellingtonae/braker/GCA_001723225.1_ASM172322v1_genomic/augustus.aa G.ellingtonae.protein.fa

Published in BIOXIV at this time.
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/58_Renamatorium/1_genomeNgff/augustus.aa H.glycines.protein.fa
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/58_Renamatorium/1_genomeNgff/fixed.augustus.gff3 H.glycines.gff3

Rename genes and gff so that they are reasonably easy to work with

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#had to make modifications to names of proteins to get the gff and protein.fa to agree
sed 's/\.t/\./g' H.glycines.protein.fa |sed 's/Hetgly\.G/Hetgly\.T/g'  > H.glycinesFixed.protein.fa

module load maker
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Hgly --justify 8 H.glycines.gff3 >H.glycines.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids H.glycines.map H.glycinesFixed.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids H.glycines.map H.glycines.gff3

##Run this for each gff and protein fasta

#B.xylophilus
sed 's/transcript://g' B.xylophilus.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >B.xylophilusFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Bxyl. --justify 8 B.xylophilusFixed.gff3 >B.xylophilus.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids B.xylophilus.map B.xylophilus.protein.fa

#D. destructor
sed 's/transcript://g' D.destructor.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >D.destructorFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Ddes. --justify 8 D.destructorFixed.gff3 >D.destructor.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids D.destructor.map D.destructor.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids D.destructor.map D.destructorFixed.gff3

#G. pallida
sed 's/transcript://g' G.pallida.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >G.pallidaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Gpal. --justify 8 G.pallidaFixed.gff3 >G.pallida.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids G.pallida.map G.pallida.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids G.pallida.map G.pallidaFixed.gff3

#G. rostochiensis
sed 's/transcript://g' G.rostochiensis.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >G.rostochiensisFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Gros. --justify 8 G.rostochiensisFixed.gff3 >G.rostochiensis.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids G.rostochiensis.map G.rostochiensis.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids G.rostochiensis.map G.rostochiensis.gff3

#G.ellingtonae
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/26_GloboderaSynteny/otherGloboderaGenomes/ellingtonae/braker/GCA_001723225.1_ASM172322v1_genomic/augustus.gff3 G.ellingtonae.gff3
cp /work/GIF/remkv6/Baum/CamTechGenomeComparison/26_GloboderaSynteny/otherGloboderaGenomes/ellingtonae/braker/GCA_001723225.1_ASM172322v1_genomic/augustus.aa G.ellingtonae.protein.fa


less G.ellingtonae.gff3 |sed 's/\.exon.*;/;/g' |sed 's/\.gene.*;/;/g' |sed 's/\.CDS.*;/;/g' | sed 's/\.transcript.*;/;/g' > G.ellingtonaeFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Gell. --justify 8 G.ellingtonaeFixed.gff3 > G.ellingtonae.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids G.ellingtonae.map G.ellingtonae.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids G.ellingtonae.map G.ellingtonaeFixed.gff3

#M. arenaria
sed 's/transcript://g' M.arenaria.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >M.arenariaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Mare --justify 8 M.arenariaFixed.gff3 >M.arenaria.map

/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids  M.arenaria.map M.arenariaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids  M.arenaria.map M.arenaria.protein.fa

#M. hapla
sed 's/transcript://g' M.hapla.gff3 |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' >M.haplaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Mhap --justify 8 M.haplaFixed.gff3 >M.hapla.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids  M.hapla.map M.hapla.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids  M.hapla.map M.haplaFixed.gff3

#M.floridensis
less M.floridensis.gff3 |sed 's/transcript://g' |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' > M.floridensisFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Mflo --justify 8 M.floridensisFixed.gff3 >M.floridensis.map

/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids M.floridensis.map M.floridensis.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids M.floridensis.map M.floridensisFixed.gff3

#M.incognita
less M.incognita.gff3 |sed 's/transcript://g' |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' > M.incognitaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Minc --justify 8 M.incognitaFixed.gff3 >M.incognita.map

##The M.incognita gff had scaffold names attached to gene names, so I had to make a separate map.
less M.incognita.map |sed 's/g/\tMinc/g' |cut -f 2,3 >M.incognitaFasta.map

#While this was successful for primary transcripts, it did not change the alternative isoforms.  
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids M.incognitaFasta.map M.incognita.protein.fa
##removing alternative isoforms that were left in the protein fasta file
module load cdbfasta
cdbfasta M.incognita.protein.fa
#only getting primary isoforms, all of which end with "RA"
grep ">" M.incognita.protein.fa |grep "RA" |sed 's/>//g' |cdbyank M.incognita.protein.fa.cidx >M.incognitaFixed.protein.fa

##This map and gff worked fine without  modification to the map, but still had cds, exon, mrna, and gene attached to gene names
less M.incognita.gff3 |sed 's/transcript://g' |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' > M.incognitaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids M.incognita.map M.incognitaFixed.gff3

# M.javanica
less M.javanica.gff3  |sed 's/transcript://g' |sed 's/exon://g' |sed 's/gene://g' |sed 's/cds://g' > M.javanicaFixed.gff3
/shared/software/GIF/programs/maker/2.31.8b/bin/maker_map_ids --prefix Mjav --justify 8 M.javanicaFixed.gff3 >M.javanica.map
/shared/software/GIF/programs/maker/2.31.8b/bin/map_fasta_ids  M.javanica.map M.javanica.protein.fa
/shared/software/GIF/programs/maker/2.31.8b/bin/map_gff_ids  M.javanica.map M.javanicaFixed.gff3

Gather primary protein isoforms and run orthofinder

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#/work/GIF/remkv6/USDA/17_OrthofinderTreeSecretome/02_orthofinder
for f in ../01_GFFandProteins/*protein.fa; do ln -s $f; done

#this one has the original names, so we dont want it
unlink M.incognita.protein.fa
unlink H.glycines.protein.fa

#gettting ready to extract only primary isoforms
module load cdbfasta
for f in *fa; do cdbfasta $f;done

#grab the fasta ids, RA is on every primary isoform, get rid of ">", because cdbyank needs it removed
for f in *fa; do grep ">" $f |grep "RA" |sed 's/>//g' |cdbyank $f.cidx >Primary$f; done

#move all primary proteins to a new folder and run orthofinder
#just a note, the reason we want primary is that orthofinder will call alternative spliced isoforms as duplicated, and so flaw the analysis
mkdir 01_orthofinderRun
cd 01_orthofinderRun/
for f in ../Primary*fa; do ln -s $f;done
printf "module load orthofinder\northofinder -t 16 -a 16  -f 01_orthofinderRun/\n" >orthofinder.sh
################################################################
module load orthofinder
orthofinder -t 16 -a 16 -f 01_orthofinderRun/
################################################################

Run signalp to get proteins with a signal peptide and lacking a transmembrane domain (secreted proteins)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#/work/GIF/remkv6/USDA/17_OrthofinderTreeSecretome/03_SignalP
#softlink all of the protein files
for f in ../02_orthofinder/01_orthofinderRun/*fa; do ln -s $f;done

#need to split up the protein fasta so that there are fewer than 10,000 proteins for each run, otherwise signalp complains
fasta-splitter.pl --n-parts 10 PrimaryM.hapla.protein.fa

#run signalp on all of the split fasta files
module load signalp

## Im using the notm database, outputting a signal peptide file, and feeding the output from each process into the same file. This only takes a couple minutes for 20k proteins
for f in PrimaryM.hapla.protein.part*; do echo "signalp -s notm -m -V "$f" >>"${f%%.*}".SignalP.out &"; done >M.haplaSignalp.sh
sh M.haplaSignalp.sh

for f in *fa; do fasta-splitter.pl --n-parts 20 $f ;done

#run signalp for each split fasta file, all the extra commands are making sure that the output files are the same for each species
for f in *part*fa; do echo "signalp -s notm -m -V "$f" >>"${f%.*}".signalp.out &";done |sed 's/\.part/\t/2' |sed 's/\.signalp.out/\t\.signalp.out/g' |sed '0~15 s/$/\nwait/g' >signalp.sh

Run orthofinder on secreted proteins

1
2
3
4
5
6
7
8
9
# softlink the secreted protein fastas to subfolder below
#/work/GIF/remkv6/USDA/17_OrthofinderTreeSecretome/04_OrthofinderSignalP/01_orthofinderRun
for f in ../../03_SignalP/*secreted.fa; do ln -s $f;done

#/work/GIF/remkv6/USDA/17_OrthofinderTreeSecretome/04_OrthofinderSignalP
module load orthofinder
orthofinder -t 16 -a 16 -f 01_orthofinderRun/