ENGINE-95 intermittent commit - header field parsing peg-mime
author Krista Grothoff <krista@pep-project.org>
Tue, 20 Sep 2016 09:57:22 +0200
branch peg-mime
changeset 1181 d5dcb3df9ef6
parent 1180 17c88a029958
child 1182 179d3ba1053d
ENGINE-95 intermittent commit - header field parsing
mime-parser/mailmime.peg
src/mailmime.c
src/mailmime.h
     1.1 --- a/mime-parser/mailmime.peg	Mon Sep 19 02:45:42 2016 +0200
     1.2 +++ b/mime-parser/mailmime.peg	Tue Sep 20 09:57:22 2016 +0200
     1.3 @@ -1,19 +1,15 @@
     1.4  Mail                    <-      ( Header )+ CRLF ( Body )? EOF
     1.5  
     1.6 -# As pretty as it would be to make the header parse elegant, 
     1.7 -# the parse semantics work better this way.
     1.8 -# Marginally, a header is HeaderKey COLON HeaderValue (with some
     1.9 -# lines possible afterwards), but we have some specific headers that
    1.10 -# have meaning, so they are explicit here.
    1.11 -Header                  <-      < MIMEHeader > { printf("MIMEHeader! %s\n", yytext); } / 
    1.12 +Header                  <-     ( < MIMEHeader > { printf("MIMEHeader! %s\n", yytext); } / 
    1.13                                  < MessageHeader > { printf("MessageHeader!%s\n", yytext); } / 
    1.14 -                                < GenericHeader > { printf("GenericHeader!%s\n", yytext); }
    1.15 +                                < GenericHeader > { printf("GenericHeader!%s\n", yytext); } )
    1.16  
    1.17  GenericHeader           <-      HeaderKey COLON HeaderText
    1.18  HeaderKey               <-      ( Alpha / Digit / OrdinarySymbol )+
    1.19 +HeaderPhrase            <-      EncodedWord / Word
    1.20  HeaderText              <-      HeaderLine ( HeaderCont )*
    1.21  HeaderCont              <-      Whitespace HeaderLine
    1.22 -HeaderLine              <-      Text* CRLF
    1.23 +HeaderLine              <-      HeaderPhrase* CRLF
    1.24  
    1.25  #MIME Header Blocks
    1.26  
    1.27 @@ -62,7 +58,7 @@
    1.28  MIMEContentDescKey      <-      'Content-Description'
    1.29  MIMEExtensionKey        <-      'Content-'HeaderKey
    1.30  
    1.31 -#VersionString           <-      Text* Digit Text* "." Text* Digit Text* 
    1.32 +#VersionString           <-     Text* Digit Text* "." Text* Digit Text* 
    1.33  VersionString           <-      HeaderText
    1.34  ContentTypeString       <-      ContentType "/" ContentSubtype (";" Parameter)*
    1.35  ContentDispoString      <-      HeaderText
    1.36 @@ -85,15 +81,31 @@
    1.37  IETFToken               <-      Text
    1.38  ContentSubtype          <-      ExtensionToken / IANAToken
    1.39  IANAToken               <-      Text
    1.40 -Parameter               <-      Attribute "=" Value #attribute matching always case insensitive
    1.41 -Attribute               <-      Token
    1.42 +
    1.43 +Parameter               <-      RegularParameter / ExtendedParameter
    1.44 +RegularParameter        <-      RegularParameterName "=" Value
    1.45 +RegularParameterName    <-      Attribute Section?
    1.46  Value                   <-      Token / QuotedString
    1.47  Token                   <-      SymbolNoTSpecials+
    1.48 +Attribute               <-      AttributeChar+
    1.49 +AttributeChar           <-      (![NonAttributeChar] ASCII_Printable)
    1.50 +NonAttributeChar        <-      Space / "*" / "'" / "%" / TSpecials
    1.51 +Section                 <-      InitialSection / OtherSections
    1.52 +InitialSection          <-      "*0"
    1.53 +OtherSections           <-      "*" ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9") Digit*
    1.54 +ExtendedParameter       <-      (ExtendedInitialName "=" ExtendedInitialValue) /
    1.55 +                                (ExtendedOtherNames "="ExtendedOtherValues)
    1.56 +ExtendedInitialName     <-      Attribute InitialSection? "*"
    1.57 +ExtendedOtherNames      <-      Attribute OtherSections "*"
    1.58 +ExtendedInitialValue    <-      [Charset] "'" [Language] "'" ExtendedOtherValues
    1.59 +ExtendedOtherValues     <-      (ExtOctet / AttributeChar)*
    1.60 +ExtOctet                <-      "%" HexDigit HexDigit
    1.61 +
    1.62  
    1.63  
    1.64  
    1.65  # Message-related headers (per RFC2822)
    1.66 -MessageHeader           <-      MessageIDHeader /
    1.67 +MessageHeader           <-     ( MessageIDHeader /
    1.68                                  MessageToHeader /
    1.69                                  MessageFromHeader /
    1.70                                  MessageCCHeader /
    1.71 @@ -101,14 +113,14 @@
    1.72                                  MessageSubjHeader /
    1.73                                  MessageReplyToHeader /
    1.74                                  MessageInReplyToHeader /
    1.75 -                                MessageRefsHeader      
    1.76 +                                MessageRefsHeader ) CRLF      
    1.77                                  
    1.78  MessageIDHeader         <-      MessageIDKey COLON IDString
    1.79  MessageToHeader         <-      MessageToKey COLON < ToString >     {printf("To: %s\n", yytext, stderr);}
    1.80  MessageFromHeader       <-      MessageFromKey COLON FromString
    1.81  MessageCCHeader         <-      MessageCCKey COLON CCString
    1.82  MessageBCCHeader        <-      MessageBCCKey COLON BCCString
    1.83 -MessageSubjHeader       <-      MessageSubjKey COLON SubjString
    1.84 +MessageSubjHeader       <-      MessageSubjKey COLON Whitespace* SubjString
    1.85  MessageReplyToHeader    <-      MessageReplyToKey COLON ReplyToString
    1.86  MessageInReplyToHeader  <-      MessageInReplyToKey COLON InReplyToString
    1.87  MessageRefsHeader       <-      MessageRefsKey COLON RefsString
    1.88 @@ -121,20 +133,57 @@
    1.89  MessageReplyToKey       <-      'Reply-To'
    1.90  MessageInReplyToKey     <-      'In-Reply-To'
    1.91  MessageRefsKey          <-      'References'  
    1.92 -IDString                <-      HeaderText
    1.93 -ToString                <-      HeaderText
    1.94 -FromString              <-      HeaderText
    1.95 -CCString                <-      HeaderText
    1.96 -BCCString               <-      HeaderText
    1.97 +IDString                <-      MsgID CRLF
    1.98 +ToString                <-      AddressList
    1.99 +FromString              <-      MailboxList
   1.100 +CCString                <-      AddressList
   1.101 +BCCString               <-      AddressList
   1.102  SubjString              <-      HeaderText
   1.103 -ReplyToString           <-      HeaderText
   1.104 -InReplyToString         <-      HeaderText
   1.105 -RefsString              <-      HeaderText
   1.106 +ReplyToString           <-      AddressList
   1.107 +InReplyToString         <-      MsgID+
   1.108 +RefsString              <-      MsgID+
   1.109  
   1.110  
   1.111 -                        
   1.112 -Body                <-      PlainBody
   1.113 -PlainBody           <-      .* EOF
   1.114 +# RFC2822 Address Specification
   1.115 +Address                 <-      Mailbox / Group
   1.116 +Mailbox                 <-      NameAddr / AddrSpec
   1.117 +NameAddr                <-      DisplayName? AngleAddr
   1.118 +AngleAddr               <-      CFWS? "<" AddrSpec ">" CFWS? / ObsAngleAddr
   1.119 +Group                   <-      DisplayName ":" (MailboxList / CFWS)? ";" CFWS?
   1.120 +DisplayName             <-      Phrase
   1.121 +MailboxList             <-      (Mailbox ("," Mailbox)*) / ObsMBoxList
   1.122 +AddressList             <-      (Address ("," Address)*) / ObsAddrList
   1.123 +
   1.124 +AddrSpec                <-      LocalPart "@" Domain
   1.125 +LocalPart               <-      DotAtom / QuotedString / ObsLocalPart
   1.126 +Domain                  <-      DotAtom / DomainLiteral / ObsDomain
   1.127 +DomainLiteral           <-      CFWS? "[" (FoldingWhiteSpace? DContent)* FoldingWhiteSpace? "]" CFWS?
   1.128 +DContent                <-      DText / QuotedPair
   1.129 +DText                   <-      NoWSCtl /
   1.130 +                                [\041-\132] /
   1.131 +                                [\136-\176]
   1.132 +
   1.133 +# Supported Obsolete Addressing
   1.134 +ObsAngleAddr            <-      CFWS? "<" ObsRoute? AddrSpec ">" CFWS?
   1.135 +ObsRoute                <-      CFWS? ObsDomainList ":" CFWS?
   1.136 +ObsDomainList           <-      "@" Domain ((CFWS / "," )* CFWS? "@" Domain)*
   1.137 +ObsLocalPart            <-      RFCWord ("." RFCWord)*
   1.138 +ObsDomain               <-      Atom ("." Atom)*
   1.139 +ObsMBoxList             <-      (Mailbox? CFWS? "," CFWS?)+ Mailbox?
   1.140 +ObsAddrList             <-      (Address? CFWS? "," CFWS?)+ Address?
   1.141 +                      
   1.142 +                      
   1.143 +# RFC2822 Message ID
   1.144 +MsgID                   <-       CFWS? "<" IDLeft "@" IDRight ">" CFWS?
   1.145 +IDLeft                  <-       DotAtomText / NoFoldQuote / ObsIDLeft
   1.146 +IDRight                 <-       DotAtomText / NoFoldLiteral / ObsIDRight
   1.147 +NoFoldQuote             <-       "\"" *(QText / QuotedPair) "\""
   1.148 +NoFoldLiteral           <-       "[" *(DText / QuotedPair) "]"
   1.149 +ObsIDLeft               <-      LocalPart
   1.150 +ObsIDRight              <-      Domain
   1.151 +                      
   1.152 +Body                    <-      PlainBody
   1.153 +PlainBody               <-      .* EOF
   1.154  
   1.155  Encapsulation       <-      Delimiter BodyPart CRLF
   1.156  CloseDelimiter      <-      Delimiter "--"
   1.157 @@ -162,7 +211,9 @@
   1.158  QText               <-      !["\"" "\\" CR]
   1.159  QuotedPair          <-      "\\". / "\\" Text
   1.160  
   1.161 -Text                <-      ( Alpha / Digit / Symbol / Space )+
   1.162 +Text                <-      ( Word / Space )+
   1.163 +Word                <-      ( AlphaNum / Symbol )
   1.164 +AlphaNum            <-      Alpha / Digit
   1.165  Alpha               <-      [a-zA-Z]
   1.166  Digit               <-      [0-9]
   1.167  Symbol              <-      OrdinarySymbol / COLON
   1.168 @@ -173,10 +224,93 @@
   1.169  SymbolNoTSpecials   <-      ["!" "#" "$" "%" "&" "'" "*" "+" "-" "." "^" "_" "`" "{" "|" "}" "~"]
   1.170  CommentSymbols      <-      [\041-\047] / [\52-57] / [\072-\100] / [\133] / [\135-\140] / [\173-177]
   1.171  
   1.172 +TSpecials           <-      !["."] (Specials / ["/" "?" "="])
   1.173 +Specials            <-      ["(" ")" "<" ">" "@" "," ";" ":" "\\" "\"" "." "\[" "\]"]
   1.174 +
   1.175 +ASCII_CHAR          <-      [\000-\127]
   1.176 +ASCII_Printable     <-      [\041-\176]
   1.177 +ASCII_CTL           <-      [\000-\031] / [\127]
   1.178 +LWS                 <-      CRLF? Space+
   1.179 +
   1.180 +
   1.181 +
   1.182  OCTET               <-      '\\' (([0-1] [0-9] [0-9]) / ("2" (([0-4] [0-9]) / ("5" [0-5])))) 
   1.183  COLON               <-      ':'
   1.184  NEWLINE             <-      '\n'
   1.185  CR                  <-      '\r'
   1.186  CRLF                <-      CR CR? NEWLINE 
   1.187  EOF                 <-      !.
   1.188 -NoWSCtl             <-      [\001-\010] / [\013-\014] / [\016-\037] / [\177]
   1.189 \ No newline at end of file
   1.190 +NoWSCtl             <-      [\001-\010] / [\013-\014] / [\016-\037] / [\177]
   1.191 +
   1.192 +
   1.193 +HexDigit                <-      (Digit / "A" / "B" / "C" / "D" / "E" / "F")
   1.194 +
   1.195 +EncodedWord             <-      <"=?" Charset ("*" Language)? "?" EncodedText "?="> { printf("V1%s\n", yytext); } /
   1.196 +                                <"=?" Charset "?" Encoding "?" EncodedText "?="> { printf("V2%s\n", yytext); }
   1.197 +                                
   1.198 +Encoding                <-      [Q q] / [B b]
   1.199 +
   1.200 +Charset                 <-      "utf-8" /
   1.201 +                                "US-ASCII" /
   1.202 +                                "Big5" /
   1.203 +                                "EUC-JP" /
   1.204 +                                "EUC-KR" /
   1.205 +                                "GB2312" /
   1.206 +                                "ISO-2022-JP" /
   1.207 +                                "ISO-2022-JP-2" /
   1.208 +                                "ISO-2022-KR" /
   1.209 +                                "ISO-8859-1" /
   1.210 +                                "ISO-8859-10" /
   1.211 +                                "ISO-8859-2" /
   1.212 +                                "ISO-8859-3" /
   1.213 +                                "ISO-8859-4" /
   1.214 +                                "ISO-8859-5" /
   1.215 +                                "ISO-8859-6" /
   1.216 +                                "ISO-8859-6-E" /
   1.217 +                                "ISO-8859-6-I" /
   1.218 +                                "ISO-8859-7" /
   1.219 +                                "ISO-8859-8" /
   1.220 +                                "ISO-8859-8-E" /
   1.221 +                                "ISO-8859-8-I" /
   1.222 +                                "ISO-8859-9" /
   1.223 +                                "KOI8-R" /
   1.224 +                                "Shift_JIS"
   1.225 +
   1.226 +Language                <-      PrimarySubtag ("-" Subtag)*
   1.227 +EncodedText             <-      (!["?"] ( Alpha / Digit / Symbol ))+
   1.228 +PrimarySubtag           <-      Alpha Alpha? Alpha? Alpha? Alpha? Alpha? Alpha? Alpha?
   1.229 +Subtag                  <-      AlphaNum AlphaNum? AlphaNum? AlphaNum? AlphaNum? AlphaNum? AlphaNum? AlphaNum?
   1.230 +                            
   1.231 +# May need to be updated - this is ancient
   1.232 +UTF8Octets          <-      UTF8Char*
   1.233 +UTF8Char            <-      UTF8_1 / UTF8_2 / UTF8_3 / UTF8_4
   1.234 +UTF8_1              <-      [\000-\177]
   1.235 +UTF8_2              <-      [\302-\337] UTF8Tail
   1.236 +UTF8_3              <-      [\340] [\240-\277] UTF8Tail / 
   1.237 +                            [\341-\354] UTF8Tail UTF8Tail /
   1.238 +                            [\355] [\200-\237] UTF8Tail / 
   1.239 +                            [\356-\357] UTF8Tail UTF8Tail
   1.240 +UTF8_4              <-      [\360] [\220-\277] UTF8Tail UTF8Tail / [\361-\363] UTF8Tail UTF8Tail UTF8Tail /
   1.241 +                            [\364] [\200-\217] UTF8Tail UTF8Tail
   1.242 +UTF8Tail            <-      [\200-\277]
   1.243 +
   1.244 +RFCWord             <-      Atom / QuotedString
   1.245 +Phrase           <-      RFCWord+ / ObsPhrase
   1.246 +ObsPhrase           <-      RFCWord (RFCWord / "." / CFWS)*
   1.247 +
   1.248 +
   1.249 +# Atom (2822 - mostly for supported obsolete stuff)
   1.250 +AText               <-      Alpha / Digit / 
   1.251 +                            "!" / "#" /   
   1.252 +                            "$" / "%" /   
   1.253 +                            "&" / "'" /
   1.254 +                            "*" / "+" /
   1.255 +                            "-" / "/" /
   1.256 +                            "=" / "?" /
   1.257 +                            "^" / "_" /
   1.258 +                            "`" / "{" /
   1.259 +                            "|" / "}" /
   1.260 +                            "~"
   1.261 +Atom                <-       CFWS? AText+ CFWS?
   1.262 +DotAtom             <-       CFWS? DotAtomText CFWS?
   1.263 +DotAtomText         <-       AText+ ("." AText+)*
     2.1 --- a/src/mailmime.c	Mon Sep 19 02:45:42 2016 +0200
     2.2 +++ b/src/mailmime.c	Tue Sep 20 09:57:22 2016 +0200
     2.3 @@ -17,11 +17,14 @@
     2.4          return PEP_ILLEGAL_VALUE;
     2.5      
     2.6      *msg = NULL;
     2.7 + 
     2.8 +    _msg = new_message(PEP_dir_incoming);
     2.9      
    2.10      yycontext ctx;
    2.11      memset(&ctx, 0, sizeof(yycontext));
    2.12      ctx.input_str = mimetext;
    2.13      ctx.index_consumed = 0;
    2.14 +    ctx.parsed_msg = &_msg;
    2.15      yyparse(&ctx);
    2.16  
    2.17      return PEP_STATUS_OK;
     3.1 --- a/src/mailmime.h	Mon Sep 19 02:45:42 2016 +0200
     3.2 +++ b/src/mailmime.h	Tue Sep 20 09:57:22 2016 +0200
     3.3 @@ -12,7 +12,10 @@
     3.4  
     3.5  #define YY_DEBUG
     3.6  #define YY_CTX_LOCAL
     3.7 -#define YY_CTX_MEMBERS const char*   input_str; size_t index_consumed; 
     3.8 +#define YY_CTX_MEMBERS const char* input_str;    \
     3.9 +                       size_t index_consumed;    \
    3.10 +                       message* parsed_msg;     \
    3.11 +                       pEp_mailmime* msg_root;   
    3.12  
    3.13  #define YY_INPUT(yycontext, buf, result, max_size)                        \
    3.14  {                                                                         \