-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaveweb-search-backend.py
295 lines (247 loc) · 9.73 KB
/
saveweb-search-backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
from html import unescape as html_unescape
from datetime import datetime, timezone
from functools import wraps
import asyncio
import os
import time
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
import meilisearch_python_sdk
import meilisearch_python_sdk.errors
# Meilisearch API key; an empty string means an unauthenticated instance.
MEILI_KEY = os.getenv('MEILI_KEY', '')
print('$MEILI_KEY', 'set' if MEILI_KEY else 'not set')
# Base URL of the Meilisearch server.
MEILI_URL = os.getenv('MEILI_URL', 'http://127.0.0.1:7700')
print('$MEILI_URL', MEILI_URL)
# 1-minute load-average threshold above which requests are rejected
# (see load_limiter); defaults to cpu_count/1.5, or 1.5 when cpu_count
# is unavailable.
STWP_SEARCH_MAX_LOAD = float(os.getenv('STWP_SEARCH_MAX_LOAD')) if os.getenv('STWP_SEARCH_MAX_LOAD') else (
    os.cpu_count() / 1.5 if os.cpu_count() else 1.5
)
print('$STWP_SEARCH_MAX_LOAD', STWP_SEARCH_MAX_LOAD)
# Maximum number of concurrently-processed ("in flight") requests
# (see ops_limiter); defaults to 2x the load threshold, clamped to >= 1.
STWP_SEARCH_MAX_FLYING_OPS = int(os.getenv('STWP_SEARCH_MAX_FLYING_OPS')) if os.getenv('STWP_SEARCH_MAX_FLYING_OPS') else (
    int(STWP_SEARCH_MAX_LOAD * 2)
)
STWP_SEARCH_MAX_FLYING_OPS = STWP_SEARCH_MAX_FLYING_OPS if STWP_SEARCH_MAX_FLYING_OPS >= 1 else 1
print('$STWP_SEARCH_MAX_FLYING_OPS', STWP_SEARCH_MAX_FLYING_OPS)
# Comma-separated list of allowed CORS origins; defaults to '*' (any origin).
STWP_SEARCH_CORS = os.getenv('STWP_SEARCH_CORS', ','.join([
    # 'https://search.saveweb.org',
    '*'
]))
print('$STWP_SEARCH_CORS', STWP_SEARCH_CORS)
app = FastAPI()
# Configure CORS from $STWP_SEARCH_CORS.
app.add_middleware(
    CORSMiddleware,
    allow_origins=STWP_SEARCH_CORS.split(','),
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Name of the Meilisearch index holding the archived feed entries.
INDEX_NAME = "entry"
async def get_load():
    """Return the 1-minute system load average as a float.

    Reads /proc/loadavg, so this only works on Linux.
    """
    with open('/proc/loadavg', 'r') as loadavg_file:
        first_field = loadavg_file.read().split()[0]
    return float(first_field)
def load_limiter(func):
    """Decorator: reject the request when the 1-minute load average exceeds
    STWP_SEARCH_MAX_LOAD; otherwise await the wrapped endpoint.

    Fix: the overload response now carries HTTP 503 + Retry-After, consistent
    with ops_limiter — previously it returned 200 with a Retry-After header,
    which clients could not distinguish from a successful search.
    """
    @wraps(func)
    async def wrapper(*args, **kwargs):
        if await get_load() > STWP_SEARCH_MAX_LOAD:
            print('[INFO] 荷载过高')  # load too high
            return JSONResponse({
                'hits': [
                    {
                        'title': '丑搜当前荷载过高,请稍后再试',
                        'content': '服务器荷载过高,请稍后再试。原因:1. 数据库正在更新全文索引 2. 服务器没有摸鱼,在干其它重荷载的任务',
                        'author': ';丑搜',
                        'date': int(time.time()),
                        'link': '#',
                    },
                ],
                'error': '丑搜当前荷载过高,请稍后再试',
            }, status_code=503, headers={'Retry-After': '30'})
        return await func(*args, **kwargs)
    return wrapper
flying_ops = 0  # number of requests currently being processed

def ops_limiter(func):
    """Decorator: cap concurrent requests at STWP_SEARCH_MAX_FLYING_OPS.

    Over the cap, respond 503 + Retry-After instead of running the endpoint.
    The counter is safe without a lock: there is no await between the check
    and the increment, and asyncio runs callbacks on a single thread.
    """
    @wraps(func)
    async def limited(*args, **kwargs):
        global flying_ops
        if flying_ops >= STWP_SEARCH_MAX_FLYING_OPS:
            print('[INFO] 操作过多')  # too many in-flight operations
            body = {
                'hits': [
                    {
                        'title': '飞行中的搜索过多,请稍后再试',
                        'content': '同一时间内的搜索请求过多。请稍后再试。',
                        'author': ';丑搜',
                        'date': int(time.time()),
                        'link': '#',
                    },
                ],
                'error': '操作过多,请稍后再试',
            }
            return JSONResponse(body, status_code=503, headers={'Retry-After': '30'})
        flying_ops += 1
        try:
            return await func(*args, **kwargs)
        finally:
            flying_ops -= 1
    return limited
def magic_date_filter(_filter: str) -> str:
    """Expand sec(YYYY-MM-DD) / us(YYYY-MM-DD) pseudo-functions in a filter
    expression into UTC epoch seconds / microseconds respectively.
    """
    for spec in (
        ('sec(', ')', 'sec'),
        ('us(', ')', 'us'),
    ):
        _filter = _magic_date_filter(_filter, spec)
    return _filter
def _magic_date_filter(_filter: str, args: tuple[str, str, str]) -> str:
    """Replace every start_tag...end_tag span in *_filter* with the epoch
    value of the enclosed YYYY-MM-DD date (recursing once per occurrence).
    Malformed dates leave the filter unchanged from that point on.
    """
    start_tag, end_tag, mode = args
    left_at = _filter.find(start_tag)
    if left_at == -1:
        return _filter
    # Index of the closing tag; left_at - 1 (i.e. <= left_at) when absent.
    right_at = left_at + _filter[left_at:].find(end_tag)
    if right_at <= left_at:
        return _filter
    date_text = _filter[left_at + len(start_tag):right_at]
    try:
        if mode == 'sec':
            epoch = datetime.strptime(date_text, '%Y-%m-%d').replace(tzinfo=timezone.utc).timestamp()
        elif mode == 'us':
            epoch = datetime.strptime(date_text, '%Y-%m-%d').replace(tzinfo=timezone.utc).timestamp() * 1000000
        else:
            raise ValueError('mode not supported')
        replaced = _filter[:left_at] + str(int(epoch)) + _filter[right_at + len(end_tag):]
        # Handle any further occurrences of the same tag.
        return _magic_date_filter(replaced, args)
    except Exception as e:
        print('date_magic_filter error:', e)
    return _filter
# Shared async Meilisearch client; MEILI_KEY may be empty (unauthenticated).
client = meilisearch_python_sdk.AsyncClient(MEILI_URL, MEILI_KEY)
@app.get('/api/')
async def go_back_home():
    """Redirect bare /api/ requests back to the site root."""
    redirect_headers = {'Location': '/'}
    return Response(status_code=302, headers=redirect_headers)
@app.get('/api/entry/{entry_id}')
@load_limiter
@ops_limiter
async def article(entry_id: int):
    """Fetch one entry document from the Meilisearch index by id."""
    document = await client.index(INDEX_NAME).get_document(entry_id)
    return {
        'data': document,
        'humans.txt': 'is_favorite 目前与主数据库不同步',
    }
async def get_meili_max_id() -> int:
    """Return the largest document id in the index, or 0 when it is empty."""
    result = await client.index(INDEX_NAME).search(
        query="",
        limit=1,
        attributes_to_retrieve=['id'],
        sort=["id:desc"],
    )
    if not result.hits:
        return 0
    return result.hits[0]['id']
@app.get('/api/stats')
@app.head('/api/stats')
@load_limiter
@ops_limiter
async def stats():
    """Return index statistics, the max document id, and the last-indexed time.

    Document ids appear to be microsecond epoch timestamps (max_id / 1e6 is
    converted to a datetime below) — confirm against the indexer.
    """
    # Renamed local (was `stats`) so it no longer shadows this endpoint function.
    index_stats = await client.index(INDEX_NAME).get_stats()
    max_id = await get_meili_max_id()
    # microseconds -> aware UTC datetime
    last_indexed_at = datetime.fromtimestamp(max_id / 1000000, tz=timezone.utc)
    return {"db_stats": index_stats, "max_id": max_id, "last_indexed_at": last_indexed_at.isoformat()}
@app.get('/api/search')
@load_limiter
@ops_limiter
async def search(q: str = 'saveweb', p: int = 0, f: str = 'false', h: str = 'false', sort: str = ""):
    """Full-text search endpoint.

    Query parameters:
        q: the search query; may end with ``(...)`` containing a Meilisearch
           filter expression (with sec()/us() date magic, see magic_date_filter).
        p: 0-based page number; 10 hits per page.
        f: 'true' to return full document content (the search itself is always
           full-text; this only disables cropping of the returned content).
        h: 'true' to wrap matches in highlight <span> tags.
        sort: comma-separated Meilisearch sort expressions.

    On Meilisearch errors, returns HTTP 200 with an 'error' field and a
    human-readable pseudo-hit, matching the API contract in 'humans.txt'.
    """
    query = q
    page = p
    fulltext = f == 'true'
    highlight = h == 'true'
    print(query, page, 'fulltext:', fulltext, 'highlight:', highlight)
    with open('search.log', 'a') as fio:
        fio.write(query + '\t' + str(page) + '\n')
    # Empty query -> empty result.
    if not query:
        return {'error': '搜索词为空'}
    opt_params = {
        'limit': 10,
        'offset': 10 * page,
        'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen', 'content_length'],
    }
    if sort:
        opt_params['sort'] = sort.split(',')
    # Advanced search: a trailing "(...)" holds a Meilisearch filter.
    # (query is known non-empty here, so the old redundant `and query` check
    # was dropped.)
    if '(' in query and query[-1] == ')':
        _filter = query[query.find('(') + 1:query.rfind(')')]
        if not _filter:
            return {'error': '搜索语法错误: empty filter'}
        try:
            _filter = magic_date_filter(_filter)
        except Exception as e:
            return {'error': 'magic_date_filter error: ' + str(e)}
        query = query[:query.find('(')].strip()  # query may be empty when a filter is used
        print('real_filter:', _filter)
        opt_params['filter'] = _filter
    if not fulltext:
        opt_params['attributes_to_crop'] = ['content']
        opt_params['crop_length'] = 120
    if highlight:
        opt_params['attributes_to_highlight'] = ['title', 'content', 'date', 'tags', 'author']
        opt_params['highlight_pre_tag'] = '<span class="uglyHighlight text-purple-500">'
        opt_params['highlight_post_tag'] = '</span>'
    try:
        _results = await client.index(INDEX_NAME).search(query, **opt_params)
    except meilisearch_python_sdk.errors.MeilisearchError as e:
        if "invalid_search_filter" in str(e):
            return {
                'hits': [
                    {
                        'title': '搜索语法错误',
                        'content': '你这高级搜索写得有点东西哦😮: ' + e.message,
                        'author': '丑搜',
                        'date': int(time.time()),
                        'link': '#',
                    },
                ],
                'error': '搜索语法错误: ' + e.message,
            }
        print('数据库错误', e)
        return {
            'hits': [
                {
                    'title': '数据库错误',
                    'content': '查询数据库时出错。如果一直出现这个错误,说明数据库寄了,请反馈 ---- \n\n' + e.message,
                    'author': ';丑搜',
                    'date': int(time.time()),
                    'link': '#',
                },
            ],
            'error': '数据库错误: ' + e.message,
        }
    for hit in _results.hits:
        # Replace raw fields with their cropped/highlighted versions.
        if '_formatted' in hit:
            hit.update(hit['_formatted'])
            del hit['_formatted']
        hit['author'] = '' if not hit['author'] else ';' + ' ;'.join(hit['author'])
        hit['tags'] = '' if not hit['tags'] else '#' + ' #'.join(hit['tags'])
        try:
            # TODO: store links already HTML-decoded in the index
            hit['link'] = html_unescape(hit['link'])
        except Exception as e:
            # Best-effort: keep the raw link if unescaping fails.
            print('html_unescape error:', e)
    results = {
        'hits': _results.hits,
        'estimatedTotalHits': _results.estimated_total_hits,  # TODO: deprecate in favor of estimated_total_hits
        'estimated_total_hits': _results.estimated_total_hits,
        'humans.txt': '使用 API 时请检查 error 字段,高荷载/出错时会返回它',
    }
    return results
@app.route('/')
async def root(request):
    """Serve the single-page frontend (the only page there is)."""
    # Fix: close the file handle deterministically instead of leaking it
    # until GC (the original did open(...).read() without closing).
    with open('templates/index.html', 'r') as page:
        return HTMLResponse(page.read())
async def main():
    """Serve the app with hypercorn on [::]:8077."""
    import hypercorn.asyncio
    cfg = hypercorn.Config()
    cfg.bind = ['[::]:8077']
    await hypercorn.asyncio.serve(app, cfg)
if __name__ == '__main__':
    # Equivalent CLI invocation: hypercorn --bind '[::]:8077' saveweb-search-backend:app
    asyncio.run(main())