Author: Laszlo Kajan <lkajan@rostlab.org>
Description: extend the lexer to parse iterated (multi-round) BLAST reports
Forwarded: no

--- libzerg.orig/zerglexer.lex
+++ libzerg/zerglexer.lex
@@ -35,10 +35,11 @@
 static char _zerg_internal_buffer[INTERNAL_BUFFER_LENGTH];
 static char* _zerg_description_score;
 static char* _zerg_description_evalue;
+static int _zerg_tail_of_rep_len=0;
 
-static int _zerg_ignore[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+/* Per-token ignore flags, indexed by token code. Sized from AFTER_LAST_TOKEN (zerg.h) so the table cannot drift from the token list; static storage starts zeroed. */
+static int _zerg_ignore[AFTER_LAST_TOKEN];
 static char* _zerg_value;
-static int _zerg_have_hsp=0;
 
 /* _zerg_blast_type  0:blastn 1:blastp 2:blastx 3:tblastn 4:tblastx */
 static int _zerg_blast_type=0;
@@ -48,6 +49,15 @@
 #define RET(SC,VALUE,CODE) BEGIN(SC);\
  if(!_zerg_ignore[CODE]) {_zerg_value=VALUE; return CODE;}
 
+
+/* Empty the tail-of-report buffer and enter start condition sc; passing sc explicitly keeps the target state visible and searchable at call sites. */
+static void begin_tail_of_report( int sc )
+{
+    _zerg_tail_of_rep_len = 0;
+    _zerg_internal_buffer[0] = '\0';
+    BEGIN(sc);
+}
+
 static void _zerg_strip_double_spaces()
 {
   char *p1, *p2;
@@ -117,6 +127,7 @@
 %x s_descriptions
 %x s_evalue
 %x s_evalue_2
+%x s_hsp_method
 %x s_gaps
 %x s_gaps_2
 %x s_hsp
@@ -126,6 +137,9 @@
 %x s_percent_identities_2
 %x s_pre_end
 %x s_pre_hits
+%x s_pre_hits_2
+%x s_round
+%x s_round_seq_new
 %x s_query_1
 %x s_query_2
 %x s_query_3
@@ -138,6 +152,10 @@
 %x s_query_orientation_2
 %x s_query_start
 %x s_query_start_2
+%x s_query_ali
+%x s_query_ali_2
+%x s_subject_ali
+%x s_subject_ali_2
 %x s_reference
 %x s_score
 %x s_score_2
@@ -155,7 +173,7 @@
 %x s_subject_orientation_2
 %x s_subject_start
 %x s_subject_start_2
-%x s_end_of_report
+%x s_tail_of_report
 %x s_positives
 %x s_positives_2
 %x s_positives_3
@@ -166,14 +184,10 @@
 %x s_frame_5
 %x s_gaps_3
 %x s_pre_subject_name
-%x s_pre_score
-%x s_pre_end_of_report_1
-%x s_pre_end_of_report_2
-%x s_pre_end_of_report_3
 
 %%
 
-<INITIAL,s_end_of_report>T?BLAST.+         {
+<INITIAL>T?BLAST.+         {
   
   if(!strncmp(yytext,"BLASTN",6)) _zerg_blast_type=0;
   else if(!strncmp(yytext,"BLASTP",6)) _zerg_blast_type=1;
@@ -185,8 +199,8 @@
   }
 
 
-<s_blast_version>\n+      BEGIN(s_reference);
-<s_reference>"Reference:"(.+\n)+\n  BEGIN(s_query_1);
+<s_blast_version>\n+                  BEGIN(s_reference);
+<s_reference,s_query_1>Reference(.+\n)+\n+      RET(s_query_1,yytext,REFERENCE);
 <s_query_1>"Query= "      BEGIN(s_query_2);
 <s_query_2>[^ \t\n]+        RET(s_query_3,yytext,QUERY_NAME);
 
@@ -209,12 +223,14 @@
                        }
 
 <s_query_6>.+\n\n      BEGIN(s_database);
-<s_database>Database:(.+\n)+\n BEGIN(s_searching);
-<s_searching>Searching"."*done\n+  BEGIN(s_pre_hits);
-<s_pre_hits>.+"No hits found".+\n\n  RET(s_pre_end,yytext,NOHITS);
-<s_pre_hits>[ \t]+Score.+\n.+\n\n    BEGIN(s_descriptions);
-
-<s_descriptions>[^ \t\n]+  RET(s_description_annotation,yytext,DESCRIPTION_HITNAME);
+<s_database>Database:(.+\n)+\n RET(s_searching,yytext,DATABASE);
+<s_searching,s_hsp>Searching"."*done\n+  RET(s_pre_hits,yytext,SEARCHING);
+<s_pre_hits>.+"No hits found".+\n\n  RET(s_hsp,yytext,NOHITS);
+<s_pre_hits>^"Results from round"" "* BEGIN(s_round);
+<s_round>[[:digit:]]+	 RET(s_pre_hits,yytext,ROUND_NUMBER);
+<s_pre_hits>[ \t]+Score[[:space:]]+E.*\n.+\n    BEGIN(s_pre_hits_2);
+<s_pre_hits_2>^\n      BEGIN(s_descriptions);
+<s_pre_hits_2>^"Sequences used in model and found again:"\n     RET(s_pre_hits_2,yytext,ROUND_SEQ_FOUND)
 
 <s_description_annotation>.+\n {
   
@@ -258,8 +274,9 @@
       
       if(annotation_length+score_length+evalue_length+4 > INTERNAL_BUFFER_LENGTH)
 	{
-	  fprintf(stderr, "buffer overflow\n");
-	  exit(1);
+    _zerg_internal_buffer[INTERNAL_BUFFER_LENGTH-1] = 0;
+	  fprintf(stderr, "zerg buffer overflow: '%s'\n", _zerg_internal_buffer);
+	  return BUFFER_OVERFLOW;
 	}
 
       pp=_zerg_internal_buffer;
@@ -288,21 +305,14 @@
 <s_description_score>"~"        RET(s_description_evalue,_zerg_description_score,DESCRIPTION_SCORE);
 <s_description_evalue>"~"       RET(s_descriptions,_zerg_description_evalue,DESCRIPTION_EVALUE);
 
+<s_pre_hits_2,s_descriptions>\n"Sequences not found previously or not previously below threshold:"\n\n     RET(s_descriptions,yytext,ROUND_SEQ_NEW);
+<s_descriptions>[^ \t\n>]+      RET(s_description_annotation,yytext,DESCRIPTION_HITNAME);
 <s_descriptions>\n              BEGIN(s_pre_hits);
 
-<s_pre_hits>>             BEGIN(s_subject_name);
-<s_hsp>> {
-    if(_zerg_have_hsp)
-    {
-      _zerg_have_hsp=0;
-      unput('~');
-      RET(s_pre_subject_name,_zerg_internal_buffer,QUERY_END);
-    }
-    else
-    {
-      BEGIN(s_subject_name);
-    }
-  }
+<s_descriptions,s_pre_hits>\n"CONVERGED!"\n   RET(s_pre_hits,yytext,CONVERGED);
+<s_descriptions,s_pre_hits>>                  BEGIN(s_subject_name);
+
+<s_hsp>>                  BEGIN(s_subject_name);
 
 <s_pre_subject_name>"~" RET(s_subject_name,_zerg_internal_buffer+32,SUBJECT_END);
 
@@ -325,32 +335,20 @@
 
 <s_subject_length_2>\n+         BEGIN(s_hsp);
 
-<s_hsp>" Score ="" "+       {
-    if(_zerg_have_hsp)
-    {
-      _zerg_have_hsp=0;
-      unput('~');
-      RET(s_pre_score,_zerg_internal_buffer,QUERY_END);
-    }
-    else
-    {
-      BEGIN(s_score);
-    }
-  }
-
-<s_pre_score>"~" RET(s_score,_zerg_internal_buffer+32,SUBJECT_END);
-
+<s_hsp>" Score ="" "+           BEGIN(s_score);
 
 <s_score>[^ \t\n]+              RET(s_score_2,yytext,SCORE_BITS);
 <s_score_2>" bits ("            BEGIN(s_score_parentheses);
 <s_score_parentheses>[0-9]+            RET(s_score_parentheses_2,yytext,SCORE);
-<s_score_parentheses_2>"), Expect"("("[0-9]+")")?" ="" "+  BEGIN(s_evalue);
-<s_evalue>[^ \t\n]+             {
+<s_score_parentheses_2>"), Expect"(\([0-9]+\))?" ="" "+  BEGIN(s_evalue);
+<s_evalue>[^ \t\n,]+             {
      _zerg_fix_evalue();
      RET(s_evalue_2,_zerg_internal_buffer,EVALUE);
                                 }
 
+<s_evalue_2>",   Method:"[ ]*	BEGIN(s_hsp_method);
 <s_evalue_2>\n" Identities = "  BEGIN(s_identities);
+<s_hsp_method>[^\n]+		RET(s_evalue_2,yytext,HSP_METHOD);
 <s_identities>[0-9]+            RET(s_identities_2,yytext,IDENTITIES);
 <s_identities_2>"/"             BEGIN(s_alignment_length);
 <s_alignment_length>[0-9]+      RET(s_alignment_length_2,yytext,ALIGNMENT_LENGTH);
@@ -398,108 +396,108 @@
 <s_gaps_3,s_subject_orientation_2,s_frame_5>\n+    BEGIN(s_hsp);
 
 <s_hsp>(" "*\n)?"Query:"" "+       BEGIN(s_query_start);
-<s_query_start>[0-9]+           {
-  if(_zerg_have_hsp)
-  {
-    BEGIN(s_query_start_2);
-  }
-  else
-  {
-    RET(s_query_start_2,yytext,QUERY_START);
-  }
- }
-
-<s_query_start_2>" "+[^ \t]+" "+ BEGIN(s_query_end);
-<s_query_end>[0-9]+             {
-  if(!_zerg_ignore[QUERY_END])
-  {
-    char *p1, *p2;
-    p1=yytext;
-    p2=_zerg_internal_buffer;
-    while(*p1 && p2-_zerg_internal_buffer < 32-1)
-      *(p2++)=*(p1++);
-    *p2='\0';
-  }
-  BEGIN(s_query_end_2);
-  }
-
-<s_query_end_2>\n.+\n"Sbjct:"" "+ BEGIN(s_subject_start);
-<s_subject_start>[0-9]+         {
-  if(_zerg_have_hsp)
-  {
-    BEGIN(s_subject_start_2);
-  }
-  else
-  {
-    _zerg_have_hsp=1;
-    RET(s_subject_start_2,yytext,SUBJECT_START);
-  }
- }
 
-<s_subject_start_2>" "*[^ \t]+" "+ BEGIN(s_subject_end);
-<s_subject_end>[0-9]+          {
-  if(!_zerg_ignore[SUBJECT_END])
-  {
-    char *p1, *p2;
-    p1=yytext;
-    p2=_zerg_internal_buffer+32;
-    while(*p1 && p2-(_zerg_internal_buffer+32) < 32-1)
-      *(p2++)=*(p1++);
-    *p2='\0';
-  }
-  BEGIN(s_subject_end_2);
-  }
+<s_query_start>[0-9]+    RET(s_query_start_2,yytext,QUERY_START);
+<s_query_start_2>[ ]+           BEGIN(s_query_ali);
+<s_query_ali>[[:alpha:]-]+      RET(s_query_ali_2,yytext,QUERY_ALI);
+<s_query_ali_2>[ ]+      BEGIN(s_query_end);
+<s_query_end>[0-9]+      RET(s_query_end_2,yytext,QUERY_END);
+
+%{
+/* Also active in s_query_ali and s_query_end, so a Query row with no end coordinate still reaches the Sbjct row:
+Query: 0   -------                                                      
+
+Sbjct: 897 TPGAYGG                                                      903 
+
+   and likewise a Query row with neither alignment text nor end coordinate:
+[...]
+Query: 0                                                               
+                                                                       
+Sbjct: 28                                                               28
+[...]
+*/
+%}
+<s_query_ali,s_query_end,s_query_end_2>\n.+\n"Sbjct:"" "+ BEGIN(s_subject_start);
 
+<s_subject_start>[0-9]+         RET(s_subject_start_2,yytext,SUBJECT_START);
+<s_subject_start_2>[ ]+         BEGIN(s_subject_ali);
+<s_subject_ali>[[:alpha:]-]+    RET(s_subject_ali_2,yytext,SUBJECT_ALI);
+<s_subject_ali_2>[ ]+   BEGIN(s_subject_end);
+%{
+/* s_subject_ali is listed too, so the end coordinate is still read when the Sbjct row has no alignment text:
+[...]
+Query: 0                                                               
+                                                                       
+Sbjct: 28                                                               28
+[...]
+*/
+%}
+<s_subject_ali,s_subject_end>[0-9]+           RET(s_subject_end_2,yytext,SUBJECT_END);
 <s_subject_end_2>\n+            BEGIN(s_hsp);
 
 
-<s_pre_hits,s_pre_end,s_hsp>[ \t\n]
+<s_pre_hits,s_pre_end>[ \t\n]
 
    
-<s_pre_hits,s_pre_end>Database:      RET(s_end_of_report,"",END_OF_REPORT);
-<s_pre_hits,s_pre_end>T?BLAST.         {
-       yyless(0); 
-       RET(s_end_of_report,"",END_OF_REPORT);
- }
-
+<s_pre_hits,s_pre_end>Database: {
+    yyless(0);
+    begin_tail_of_report(s_tail_of_report);
+  }
 
-<s_hsp>Database:  { unput('~'); BEGIN(s_pre_end_of_report_1); }
-  
+<s_tail_of_report>T?BLAST.         {
+    yyless(0); 
+    RET(s_pre_end,_zerg_internal_buffer,TAIL_OF_REPORT);
+  }
 
-<s_hsp>T?BLAST.    {
+<s_pre_hits,s_pre_end,s_hsp>T?BLAST.         {
        yyless(0); 
-       unput('~');
-       BEGIN(s_pre_end_of_report_1);
+       RET(INITIAL,"",END_OF_REPORT);
  }
 
-
-<s_pre_end_of_report_1>"~"  {
-     if(_zerg_have_hsp)
-     {
-       _zerg_have_hsp=0;
-       unput('~');
-       unput('~');
-       RET(s_pre_end_of_report_2,_zerg_internal_buffer,QUERY_END);
-     }
-     else
-     {
-       RET(s_end_of_report,"",END_OF_REPORT);
-     }
+<s_hsp>"  Database:"  {
+    yyless(0);
+    begin_tail_of_report(s_tail_of_report);
   }
 
-<s_pre_end_of_report_2>"~" RET(s_pre_end_of_report_3,_zerg_internal_buffer+32,SUBJECT_END);
-<s_pre_end_of_report_3>"~" RET(s_end_of_report,"",END_OF_REPORT);
+<s_tail_of_report>.|\n {
+    /* Append one character, keeping the buffer NUL-terminated; on overflow warn once and drop the rest. */
+    if( _zerg_tail_of_rep_len < INTERNAL_BUFFER_LENGTH-1 ) {
+      _zerg_internal_buffer[_zerg_tail_of_rep_len++] = *yytext;
+      _zerg_internal_buffer[_zerg_tail_of_rep_len] = 0;
+    } else if( _zerg_tail_of_rep_len == INTERNAL_BUFFER_LENGTH-1 ) {
+      _zerg_tail_of_rep_len = INTERNAL_BUFFER_LENGTH; /* sentinel: warning already issued for this tail */
+      fprintf(stderr, "zerg buffer overflow, tail of report truncated\n" );
+    }
+  }
 
-<s_end_of_report>.|\n   
 <*>.|\n               RET(INITIAL,yytext,UNMATCHED);
-<INITIAL,s_end_of_report><<EOF>>    _zerg_value=""; return 0;
+
+<s_tail_of_report><<EOF>>   {
+    RET(INITIAL,_zerg_internal_buffer,TAIL_OF_REPORT);
+  }
+
+<INITIAL><<EOF>> {
+    _zerg_value=""; return 0;
+  }
 
 %%
 
 
 void zerg_open_file(char* filename)
 {
-  zergin=fopen(filename,"r");
+  /* Scan `filename`; if it cannot be opened, scan stdin instead. */
+  FILE *fh = fopen(filename, "r");
+
+  /* zergrestart() only switches the input stream, so the start
+   * condition is reset to INITIAL explicitly. */
+  zergrestart(fh != NULL ? fh : stdin);
+  BEGIN(INITIAL);
+}
+
+void zerg_read_stream(FILE* stream) /* scan an already-open stream, starting in the INITIAL state */
+{
+  zergrestart( stream ? stream : stdin ); /* NULL falls back to stdin, as zerg_open_file() does when fopen() fails */
+  BEGIN(INITIAL);
 }
 
 void zerg_close_file()
@@ -516,7 +514,7 @@
 void zerg_ignore_all()
 {
   int i;
-  for(i=BLAST_VERSION; i<=UNMATCHED; i++)
+  for(i=BLAST_VERSION; i<AFTER_LAST_TOKEN; i++) /* every token code, including those numbered after UNMATCHED */
     _zerg_ignore[i]=1;
 }
 
@@ -528,7 +526,7 @@
 void zerg_unignore_all()
 {
   int i;
-  for(i=BLAST_VERSION; i<=UNMATCHED; i++)
+  for(i=BLAST_VERSION; i<AFTER_LAST_TOKEN; i++) /* every token code, including those numbered after UNMATCHED */
     _zerg_ignore[i]=0;
 }
 
@@ -548,3 +546,5 @@
   else
     return pos - YY_CURRENT_BUFFER->yy_n_chars + yytext -YY_CURRENT_BUFFER->yy_ch_buf - _zerg_desconto;
 }
+
+// vim:et:ts=2:
--- libzerg.orig/zerg.h
+++ libzerg/zerg.h
@@ -6,37 +6,53 @@
 #ifndef _zerg_h_
 #define _zerg_h_
 
-#define BLAST_VERSION 1   /* Esta tem que ser a primeira */
-#define QUERY_NAME 2
-#define QUERY_ANNOTATION 3
-#define QUERY_LENGTH 4
-#define NOHITS 5
-#define DESCRIPTION_HITNAME 6
-#define DESCRIPTION_ANNOTATION 7
-#define DESCRIPTION_SCORE 8
-#define DESCRIPTION_EVALUE 9
-#define SUBJECT_NAME 10
-#define SUBJECT_ANNOTATION 11
-#define SUBJECT_LENGTH 12
-#define SCORE_BITS 13
-#define SCORE 14
-#define EVALUE 15
-#define IDENTITIES 16
-#define ALIGNMENT_LENGTH 17
-#define PERCENT_IDENTITIES 18
-#define GAPS 19
-#define QUERY_ORIENTATION 20
-#define SUBJECT_ORIENTATION 21
-#define QUERY_START 22
-#define QUERY_END 23
-#define SUBJECT_START 24
-#define SUBJECT_END 25
-#define END_OF_REPORT 26
-#define POSITIVES 27
-#define PERCENT_POSITIVES 28
-#define QUERY_FRAME 29
-#define SUBJECT_FRAME 30
-#define UNMATCHED 31    /* Esta tem que ser a ultima */
+#include <stdio.h>		/* FILE, needed by the zerg_read_stream() prototype */
+
+#define BUFFER_OVERFLOW		(-1)	/* returned when a description line does not fit the internal buffer */
+
+#define BLAST_VERSION 		1   /* must be the first token code */
+#define QUERY_NAME 		2
+#define QUERY_ANNOTATION 	3
+#define QUERY_LENGTH 		4
+#define NOHITS 			5
+#define DESCRIPTION_HITNAME 	6
+#define DESCRIPTION_ANNOTATION 	7
+#define DESCRIPTION_SCORE 	8
+#define DESCRIPTION_EVALUE 	9
+#define SUBJECT_NAME 		10
+#define SUBJECT_ANNOTATION 	11
+#define SUBJECT_LENGTH 		12
+#define SCORE_BITS 		13
+#define SCORE 			14
+#define EVALUE 			15
+#define IDENTITIES 		16
+#define ALIGNMENT_LENGTH 	17
+#define PERCENT_IDENTITIES 	18
+#define GAPS 			19
+#define QUERY_ORIENTATION 	20
+#define SUBJECT_ORIENTATION 	21
+#define QUERY_START 		22
+#define QUERY_END 		23
+#define SUBJECT_START 		24
+#define SUBJECT_END 		25
+#define END_OF_REPORT 		26
+#define POSITIVES 		27
+#define PERCENT_POSITIVES 	28
+#define QUERY_FRAME 		29
+#define SUBJECT_FRAME 		30
+#define UNMATCHED 		31
+#define ROUND_NUMBER 		32
+#define HSP_METHOD 		33
+#define SEARCHING 		34
+#define QUERY_ALI 		35
+#define SUBJECT_ALI 		36
+#define ROUND_SEQ_FOUND 	37
+#define ROUND_SEQ_NEW 		38
+#define CONVERGED 		39
+#define REFERENCE 		40
+#define TAIL_OF_REPORT 		41
+#define DATABASE 		42
+#define AFTER_LAST_TOKEN 	43			/* dummy token, must be last */
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,6 +60,7 @@
 
 void zerg_open_file(char* filename);
 void zerg_close_file();
+void zerg_read_stream(FILE* stream);
 void zerg_ignore(int code);
 void zerg_ignore_all();
 void zerg_unignore(int code);
--- libzerg.orig/Makefile.am
+++ libzerg/Makefile.am
@@ -4,7 +4,7 @@
 
 lib_LTLIBRARIES = libzerg.la
 libzerg_la_SOURCES = zerg.h zerglexer.c
-libzerg_la_LDFLAGS = -version-info 0:0:0
+libzerg_la_LDFLAGS = -version-info 1:0:1
 
 lex.zerg.c: zerglexer.lex
 	flex -Cf zerglexer.lex
