-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpm-zh-decode.rc
211 lines (181 loc) · 5.84 KB
/
pm-zh-decode.rc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# set syntax=procmail
#
# Notes
#
# You should install the following binary tools in your system:
# procmail (Obviously!)
# formail (usually comes with procmail)
# mmencode (may be named ``mimencode'' on some systems)
# w3m (with M17N support)
# piconv (iconv written in Perl, you can use ``iconv'' instead)
#
# Documentation
#
# If your system does not have ``w3m'' but you still want to use this
# module in a degraded mode, you can set PM_ZH_USE_W3M to disable
# ``w3m''; words in the mail body that are URL-encoded or written as
# escaped hexadecimal will then not be processed.
#
# You can disable w3m processing by putting the following line before
# the INCLUDERC setting:
#
# PM_ZH_USE_W3M = "no"
#
# All the return values described below are encoded in UTF-8, so I
# recommend you also save your .procmailrc in UTF-8 format.
#
# Required settings
#
# In your .procmailrc config file, make sure the binary tools mentioned
# above can be found through the ${PATH} environment variable.
#
# Call arguments (variables to set before calling)
#
# (none)
#
# Return values
#
# o PM_ZH_SUBJECT contains the Subject string from the header in UTF-8
# o PM_ZH_STRIP_SUBJECT contains the Subject in UTF-8 with punctuation,
# digits, spaces and similar filler characters removed.
# o PM_ZH_FROM contains the From string from the header in UTF-8,
# usually contains sender's name and e-mail address.
# o PM_ZH_BODY contains the mail body content in UTF-8
# o E_CHARSET contains the charset name used to decode the mail body
# o E_ENCODING contains the transfer-encoding name used to decode the mail body
# o HAS_HTML_CONTENT is "yes" if there is a text/html section in the mail
# body.
# o E_HTML_CHARSET contains the charset name used to decode the
# text/html section in the mail body.
#
# Examples
#
# PATH = /bin:/usr/bin:/usr/local/bin
# # if you don't have ``w3m'', uncomment the following line
# # PM_ZH_USE_W3M = "no"
# INCLUDERC = /absolute/path/to/pm-zh-decode.rc
#
# # file the mail into the 'spam' folder if the mail body contains any
# # one of the strings ['賺錢', '貸款', '免利息', '直銷']
# :0:
# * PM_ZH_BODY ?? (賺錢|\
# 貸款|\
# 免利息|\
# 直銷)
# spam
#
# Known issues
#
# procmail has problems processing long strings inside back-quotes (`),
# so each extraction command is run in a single pass.
#
# Environment variable assignments may also occasionally misbehave
# because of procmail.
#
# Change Log
#
# 2012-05-04
# Create module.
#
# 2012-05-07
# Rewrite parsing procedure, use w3m.
#
#
#
# Program Settings
# ===========================================
# Each tool below may be overridden by assigning the variable before the
# INCLUDERC line; the text after ``:-'' is only the fallback value.
#
# Enlarge procmail's line buffer so the longer strings built here fit.
LINEBUF = 4096
# Header field extraction.
FORMAIL = ${FORMAIL:-'formail'}
# Transfer-encoding decoders: quoted-printable first, then base64.
MME_QP = ${MME_QP:-'mmencode -u -q'}
MME_B64 = ${MME_B64:-'mmencode -u -b'}
# Charset converters to UTF-8: Big5 first, then GB2312.
CONV_B2U = ${CONV_B2U:-'piconv -f big5 -t utf8'}
CONV_G2U = ${CONV_G2U:-'piconv -f gb2312 -t utf8'}
#
# Extract Fields
# ===========================================
# Body placeholder; the extraction recipe later in this file fills it in.
PM_ZH_BODY = ""
# Sender and subject: formail pulls the raw header field, then Perl's
# Encode module decodes the MIME-Header words and prints them as UTF-8.
PM_ZH_FROM = `$FORMAIL -cxFrom: | perl -MEncode -ne 'print encode("utf8", decode("MIME-Header", $_));'`
PM_ZH_SUBJECT = `$FORMAIL -cxSubject: | perl -MEncode -ne 'print encode("utf8", decode("MIME-Header", $_));'`
# Subject once more, with punctuation, digits, spaces, square brackets and
# double quotes removed.  The decode pipeline is repeated rather than
# reusing $PM_ZH_SUBJECT because reusing a long string may fail.
# All four substitutions run in a single sed process, in this order.
PM_ZH_STRIP_SUBJECT = `$FORMAIL -cxSubject: | perl -MEncode -ne 'print encode("utf8", decode("MIME-Header", $_));' | sed -e 's/'"[-+~^|#\@!& ',;:?|.=_(){}<>\/*%0-9]"/'''/g' -e 's/\[//g' -e 's/]//g' -e 's/"//g'`
#
# Charset, Encoding parsing
# ===========================================
# Inspect the Content-Type / Content-Transfer-Encoding declarations of the
# mail and record which charset and transfer encoding it uses.
#
# Defaults, kept when the mail declares nothing.
E_CHARSET = "iso-8859-1"
E_ENCODING = "7-bit"
HAS_HTML_CONTENT = "no"
# Start empty so a stale or inherited value is never mistaken for the
# charset of this mail's text/html part.
E_HTML_CHARSET = ""
:0
* HB ?? ^Content-Type:
{
  # CHARSET: a declaration found in the body is preferred; the header is
  # only consulted (E flag) when the body recipe did not match.
  :0
  *$ B ?? ^(Content-Type:.*| )charset *= *\"?\/[^\";]+
  { E_CHARSET = $MATCH }
  :0 E
  *$ H ?? ^(Content-Type:.*| )charset *= *\"?\/[^\";]+
  { E_CHARSET = $MATCH }
  # ENCODING: same body-first, header-as-fallback order.
  :0
  *$ B ?? ^Content-Transfer-Encoding: +\/.+
  { E_ENCODING = $MATCH }
  :0 E
  *$ H ?? ^Content-Transfer-Encoding: +\/.+
  { E_ENCODING = $MATCH }
  # HTML: flag every text/html part in the body, whether or not it
  # declares a charset, so HAS_HTML_CONTENT matches its documented meaning.
  :0
  * B ?? ^Content-Type: *text/html
  {
    HAS_HTML_CONTENT = "yes"
    # Record the charset only when it directly follows ``text/html;''
    # (optionally on the next line); otherwise E_HTML_CHARSET stays empty.
    :0
    * B ?? ^Content-Type: *text/html;^?( | )*charset *= *\"?\/[^\";]+
    { E_HTML_CHARSET = $MATCH }
  }
}
#
# Decoding method assignment
# ===========================================
# Turn the detected charset / transfer encoding into the filter commands
# (F_CHARSET, F_ENCODING, F_HTML) used when the body is extracted.
:0
{
  # Start from pass-through filters; the recipes below replace them only
  # for the cases this module knows how to decode.
  F_HTML = 'cat'
  F_ENCODING = 'cat'
  F_CHARSET = 'cat'

  # Charset conversion: Big5, otherwise (E flag) GB2312.
  :0
  * E_CHARSET ?? big-?5
  {
    F_CHARSET = $CONV_B2U
  }
  :0 E
  * E_CHARSET ?? gb2312-?.*
  {
    F_CHARSET = $CONV_G2U
  }

  # Transfer encoding: quoted-printable, otherwise (E flag) base64.
  :0
  * E_ENCODING ?? quoted-printable
  {
    F_ENCODING = $MME_QP
  }
  :0 E
  * E_ENCODING ?? base64
  {
    F_ENCODING = $MME_B64
  }

  # HTML rendering through w3m: only when the caller has not set
  # PM_ZH_USE_W3M to exactly "no" and an HTML part was detected.
  :0
  * ! PM_ZH_USE_W3M ?? ^^no^^
  * HAS_HTML_CONTENT ?? ^^yes^^
  {
    # w3m reads and writes the same charset; F_CHARSET does the
    # conversion to UTF-8 afterwards.
    :0
    * E_HTML_CHARSET ?? big-?5
    {
      F_HTML = 'w3m -I big5 -O big5 -T text/html'
    }
    :0 E
    * E_HTML_CHARSET ?? gb2312-?.*
    {
      F_HTML = 'w3m -I gb2312 -O gb2312 -T text/html'
    }
    :0 E
    * E_HTML_CHARSET ?? utf-?8
    {
      F_HTML = 'w3m -I utf8 -O utf8 -T text/html'
    }
  }
}
#
# Body extraction
# The MIME declarations have already been inspected at this point, so they
# are filtered out here and only the decoded body text is stored in
# PM_ZH_BODY.
#
# Pipeline stages, in order:
#   $FORMAIL -I ""  intended to drop the mail header so only the body
#                   remains (NOTE(review): confirm against formail(1))
#   awk             discards lines that look like MIME boundaries
#                   (``--'' followed only by boundary characters), lines
#                   starting with ``Content-D'' or ``Content-T'', and
#                   tab-indented ``charset='' / ``boundary='' lines
#   $F_ENCODING     transfer-encoding decoder chosen earlier
#   $F_HTML         HTML renderer chosen earlier
#   $F_CHARSET      charset converter chosen earlier
#   tr -d \\n       deletes every newline, leaving a single line
#
# NOTE: no space is allowed between the variable name and ``='' on the
# action line.  This is the special ``VAR=| command'' action form, not an
# ordinary environment variable assignment.
:0 fW
PM_ZH_BODY= | $FORMAIL -I "" | awk ' ! /^--[a-zA-Z0-9\-="._:]+$/ && ! /^Content-[DT]/ && ! /^\tcharset=/ && ! /^\tboundary=/ {print}' | $F_ENCODING | $F_HTML | $F_CHARSET | tr -d \\n