#!/usr/local/bin/perl
#
# Write gp2protein.sgd, a tab-separated mapping of SGDIDs to SwissProt
# accessions.
#
# Inputs:
#   * $external_table - tab-separated file read as four columns:
#                       external id, id type, ORF name, SGDID.
#                       Only rows whose type matches /SwissProt/ are kept.
#   * files named on the command line (or STDIN when none are given) -
#     one identifier per line; only lines that start with S or L
#     (case-insensitive) are considered.
#
# Output:
#   * gp2protein.sgd in the current directory, overwritten on each run.
#     A short "!"-prefixed header is followed by one line per matched
#     identifier:  SGD:<id> <TAB> SWISS-PROT:<external id>
#
# Exit status: 0 on success; dies (non-zero) if a file cannot be opened or
# the output file cannot be closed cleanly.
#
use strict;
use warnings;

my $external_table = "/share/ftp/yeast/data_dump/external_id/external_id.tab";
my $output         = "gp2protein.sgd";

# SGDID => external id, for rows of the external table typed as SwissProt.
# If an SGDID appears on several SwissProt rows, the last row wins.
my %swiss;

open(my $ext_in, '<', $external_table)
    or die "Cannot open $external_table: $!\n";

while (my $row = <$ext_in>) {
    # chomp (not chop) so a final line without a newline keeps its last char.
    chomp $row;

    # $orfname is not used; it is named only to document the column layout.
    my ($extid, $type, $orfname, $sgdid) = split /\t/, $row;

    # Skip short or blank rows instead of storing an undef key.
    next unless defined $type && defined $sgdid;

    if ($type =~ /SwissProt/) {
        $swiss{$sgdid} = $extid;
    }
}

close($ext_in);

open(my $gp2p, '>', $output)
    or die "Cannot open $output: $!\n";

#
# Print the date and version lines in two parts so that RCS will not
# remove the \ escape on the $.
#
print {$gp2p} "!date: \$Date: ";
print {$gp2p} " \$\n";
print {$gp2p} "!version: \$Revision: ";
print {$gp2p} " \$ \n!\n";
print {$gp2p} "!Saccharomyces Genome Database\n";
print {$gp2p} "!SGDID mapped to SwissPROT/TrEMBL Accessions\n";
print {$gp2p} "!\n";

while (my $entry = <>) {
    chomp $entry;

    # The prefix test is case-insensitive, but the lookup below uses the
    # line exactly as written, so the whole line must equal a key of %swiss.
    next unless uc($entry) =~ /^[SL]/;

    my $sgdid = $entry;
    next unless $swiss{$sgdid};

    # For now include SGD: in the first column.
    print {$gp2p} "SGD:" . $sgdid . "\tSWISS-PROT:" . $swiss{$sgdid} . "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($gp2p)
    or die "Cannot close $output: $!\n";

exit 0;