-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
341 lines (290 loc) · 8.68 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/env node
// -*- coding: utf-8 -*-
//===============================================================================
//
// Copyright (c) 2020 <> All Rights Reserved
//
//
// File: /c/Users/Administrator/chatopera/docs.bot/index.js
// Author: Hai Liang Wang
// Date: 2022-04-28:13:33:30
//
//===============================================================================
/**
*
*/
const basedir = __dirname;
const workdir = process.cwd();
const debug = require("debug")("chatopera:docs:bot");
const fs = require('fs');
const path = require('path');
const yaml = require('js-yaml');
const readlineq = require("readlineq");
const tokenizer = require("./lib/tokenizer")
const { Shorturl } = require("./lib/shorturl")
const { hash, occurrences } = require("./lib/utils")
const shorturl = new Shorturl();
/**
 * Display names for the first-level (root) documentation folders.
 * Keys are folder names, values are the category labels shown to users.
 */
const ROOT_CATEGORIES = {
  "chatbot-platform": "Chatopera",
  "cskefu": "春松客服"
};

/**
 * Translate a root folder name into its display category.
 *
 * @param {string} rootCategory root folder name
 * @returns {string} the mapped display name, or `rootCategory` itself when
 * no mapping is configured
 */
function mapRootCategory(rootCategory) {
  // Only consult own keys: a plain `ROOT_CATEGORIES[name]` lookup would also
  // hit inherited members such as "constructor" or "toString" and return a
  // function instead of a category name.
  const isMapped = Object.prototype.hasOwnProperty.call(ROOT_CATEGORIES, rootCategory);
  if (isMapped && ROOT_CATEGORIES[rootCategory]) {
    return ROOT_CATEGORIES[rootCategory];
  }
  return rootCategory;
}
/**
 * Ignored questions.
 * A post whose text equals one of the entries below is never emitted as a FAQ.
 */
const FILTER_OUT_POSTS = new Set([
  "下一步",
  "感谢您",
  "感谢你",
  "可能遇到的问题",
]);
/**
 * Resolve `to` against `from`, where `from` may be absolute or relative.
 *
 * A placeholder `resolve://` base lets the WHATWG URL parser accept a relative
 * `from`; when the placeholder scheme survives in the result, only the
 * path, query and fragment are returned.
 *
 * @param {string} from base URL (absolute or relative)
 * @param {string} to URL or path to resolve against `from`
 * @returns {string} the resolved URL, or path + query + fragment when `from`
 * was relative
 */
function resolveUrl(from, to) {
  const base = new URL(from, 'resolve://');
  const target = new URL(to, base);

  // A real scheme means `from` was absolute: return the full URL.
  if (target.protocol !== 'resolve:') {
    return target.toString();
  }

  // `from` was relative: drop the placeholder scheme again.
  return target.pathname + target.search + target.hash;
}
/**
 * Serialize FAQ entries into a string of the requested format.
 *
 * - `json`: the entries as a pretty-printed JSON array.
 * - `yaml`: `{ [repoName]: { [post]: { answers: [...] } } }` dumped with
 *   js-yaml. Answers come from the legacy `reply` field when present;
 *   otherwise from each item of `replies` (its `url`, or its `content` when it
 *   has no `url`).
 *
 * @param {Array<Object>} faqsMetadata FAQ entries (each has at least `post`)
 * @param {string} outputType output format, "yaml" or "json"
 * @param {string} repoName top-level key of the yaml document (unused for json)
 * @returns {string|undefined} the serialized data, or `undefined` (after
 * logging a message) when `outputType` is not supported
 */
function renderOutputData(faqsMetadata, outputType, repoName) {
  // Collect the yaml answers of a single entry. Entries in this file carry a
  // `replies` array rather than `reply`, so reading only `reply` would yield
  // `[undefined]`.
  const answersOf = (faq) => {
    if (faq.reply !== undefined) {
      return [faq.reply];
    }
    const replies = Array.isArray(faq.replies) ? faq.replies : [];
    return replies
      .filter((reply) => reply !== null && typeof reply === "object")
      .map((reply) => (reply.url !== undefined ? reply.url : reply.content))
      .filter((answer) => answer !== undefined);
  };

  const outputMap = {
    yaml() {
      const result = {};
      for (const faq of faqsMetadata) {
        result[faq.post] = { answers: answersOf(faq) };
      }
      return yaml.dump({ [repoName]: result }, { lineWidth: -1 });
    },
    json() {
      return JSON.stringify(faqsMetadata, null, 2);
    },
  };

  // Own-property check so names like "toString" are not treated as formats.
  if (!Object.prototype.hasOwnProperty.call(outputMap, outputType)) {
    console.log('output type undefined');
    return;
  }
  return outputMap[outputType]();
}
/**
 * Read a markdown file and return its title.
 *
 * The title is the text of the first level-1 heading (`# Title`). When the
 * file has no level-1 heading, the first heading of any level is used.
 *
 * @param {string} targetPath path of the markdown file
 * @returns {string|undefined} the trimmed heading text, or `undefined` when
 * the file contains no heading
 */
function resolveTitle(targetPath) {
  // Strip a leading BOM so a heading on the very first line is still anchored.
  const mdStr = fs.readFileSync(targetPath, 'utf-8').replace(/^\uFEFF/, '');

  // Headings are anchored to the start of a line; an unanchored /# (.+)/ would
  // also match mid-line text such as "C# is great".
  const match =
    mdStr.match(/^[ \t]{0,3}#[ \t]+(.+)$/m) ||
    mdStr.match(/^[ \t]{0,3}#{1,6}[ \t]+(.+)$/m);
  if (!match) {
    return undefined;
  }

  // Trim so the title compares equal to the trimmed section names built when
  // the file is processed line by line.
  const title = match[1].trim();
  return title || undefined;
}
/**
 * Generate FAQ entries from the heading lines of a markdown file.
 *
 * The file is read line by line with `readlineq`. Lines inside fenced code
 * blocks (delimited by three backticks) and non-heading lines are skipped.
 * Each heading line is split with `tokenizer.split`; every non-empty piece
 * becomes one entry whose reply is a hyperlink card pointing at a short URL
 * for `<url>#<current section>`. A piece whose link cannot be shortened is
 * skipped, and the failure is logged.
 *
 * @param {string} targetPath path of the markdown file
 * @param {string} url page URL of the document, used as the link prefix
 * @param {string} title document title; initial section name and a category
 * @param {string} [rootCategory] root folder name; when given, its mapped
 * display name becomes the first category
 * @returns {Promise<Array<Object>>} entries of the shape
 * `{ docId, post, replies, categories, enabled }`
 */
async function processMdFileAsFaq(targetPath, url, title, rootCategory) {
  const MKS_MARKER = "<!-- markup:skip-line -->";
  debug("[processMdFileAsFaq] targetPath %s, url %s, title %s, rootCategory %s", targetPath, url, title, rootCategory);

  const result = [];
  const lines = await readlineq(targetPath);
  let currentSection = title; // heading the current line belongs to
  let fenceCount = 0; // number of ``` markers seen so far

  for (const rawLine of lines) {
    debug("%s: %s", title, rawLine);

    // Remove specific contents (trim first, then drop the marker).
    const line = rawLine.trim().replace(MKS_MARKER, "");

    // An odd number of fence markers means we are inside a code block,
    // whose content is ignored.
    fenceCount += occurrences(line, "```");
    if (fenceCount % 2 !== 0) {
      continue;
    }

    // TODO: to turn every line (not only headings) into Q&A pairs, remove
    // this guard. Only headings are used to keep the data set small.
    if (!line.startsWith("#")) {
      continue;
    }

    const heading = line.replace(/#/g, "").trim();
    if (heading) {
      currentSection = heading;
    }

    for (const sentence of tokenizer.split(line)) {
      if (!tokenizer.pure(sentence)) continue;

      const post = sentence.replace(/#/g, "").trim();
      if (!post) continue;

      // Categories: [mapped root category,] document title [, section].
      const categories = [];
      if (rootCategory) {
        categories.push(mapRootCategory(rootCategory));
      }
      categories.push(title);
      if (currentSection !== title) {
        categories.push(currentSection);
      }

      let link = `${url}#${encodeURIComponent(currentSection)}`;
      // The id is derived from the long link, before it is shortened.
      const docId = hash(sentence + title + link);

      try {
        const ret = await shorturl.create(link);
        if (!(ret && ret.shortUrlIds && ret.shortUrlIds.length > 0)) {
          console.error(ret);
          throw new Error("Error with shortUrl Service");
        }
        link = shorturl.getProvider() + "/" + ret.shortUrlIds[0];
      } catch (e) {
        // Best effort: skip this entry, but never silently.
        console.error(
          "[processMdFileAsFaq] skip post %s, can not shorten %s: %s",
          post,
          link,
          e && e.message ? e.message : e
        );
        continue;
      }

      // Card title: "<post>|" (cut to 20 chars) unless the post is the section
      // heading itself, followed by the two most specific categories.
      const titlePrefix = post !== currentSection ? `${post.slice(0, 20)}|` : "";
      const titleCategories = [...categories].reverse().slice(0, 2).join("|");

      // Link with Thumbnail
      result.push({
        docId: docId,
        post: post,
        replies: [
          {
            "rtype": "hyperlink",
            "thumbnail": "/file/626b17379a63490018d128d2",
            "title": `${titlePrefix}${titleCategories}`,
            "content": "查看详情,快戳我~",
            "url": link
          }
        ],
        categories: categories,
        enabled: true,
      });
    }
  }

  return result;
}
/**
 * Main entry: walk a documentation tree, turn every markdown file into FAQ
 * entries and write them to the output file.
 *
 * The output file is rewritten after each processed markdown file, so partial
 * results are available while the run is in progress.
 *
 * @param {Object} options
 * @param {string} options.input root directory of the markdown documents
 * @param {string} options.baseurl base URL of the published documents
 * @param {string} [options.folders] comma separated sub folders of `input` to
 * process; each one is also used as root category. When omitted or empty, the
 * whole `input` directory is processed without a root category.
 * @param {string} [options.shorturl] short URL provider passed to
 * `shorturl.setProvider`
 * @param {string} options.type output format handed to `renderOutputData`
 * @param {string} options.repo repository name handed to `renderOutputData`
 * @param {string} options.output path of the file to write
 * @returns {Promise<void>}
 * @throws {Error} when `input` or one of the `folders` does not exist
 */
async function parse(options) {
  const MD_EXT = /\.md$/;
  const basePath = options.input;
  const baseUrl = options.baseurl.endsWith("/") ? options.baseurl : options.baseurl + "/";
  // Always an array, so the checks below never dereference null when
  // `options.folders` is not given.
  const baseFolders = options.folders
    ? options.folders.split(",").map((folder) => folder.trim()).filter(Boolean)
    : [];

  const shortUrlProvider = options.shorturl;
  if (shortUrlProvider) {
    shorturl.setProvider(shortUrlProvider);
  }

  if (!fs.existsSync(basePath)) {
    throw new Error("input path does not exist");
  }
  for (const folder of baseFolders) {
    const folderPath = path.join(basePath, folder);
    if (!fs.existsSync(folderPath)) {
      throw new Error(`Check base folder: ${folderPath} not exist.`);
    }
  }

  const faqs = [];
  const faqsPostDedup = new Set(); // posts already collected, for de-duplication

  const parsePath = async (dirPath, urlPath, rootCategory) => {
    debug("parsePath %s, urlPath %s, rootCategory %s", dirPath, urlPath, rootCategory);

    for (const name of fs.readdirSync(dirPath)) {
      const targetPath = path.join(dirPath, name);
      const stat = fs.lstatSync(targetPath);

      if (stat.isDirectory()) {
        // URL paths always use "/", whatever the OS path separator is.
        await parsePath(targetPath, path.posix.join(urlPath, name), rootCategory);
        continue;
      }
      if (!stat.isFile() || !MD_EXT.test(name)) {
        continue;
      }

      console.log("parsing file ", targetPath, "...");
      // Replace only the extension, not the first ".md" found in the name.
      const url = resolveUrl(
        baseUrl,
        path.posix.join(urlPath, name.replace(MD_EXT, '.html'))
      );
      const title = resolveTitle(targetPath);
      const entries = await processMdFileAsFaq(targetPath, url, title, rootCategory);

      for (const entry of entries) {
        if (FILTER_OUT_POSTS.has(entry.post)) {
          continue;
        }
        if (faqsPostDedup.has(entry.post)) {
          // This question has been collected already.
          console.log("parse dedup", rootCategory, "-", entry.post);
          continue;
        }
        faqs.push(entry);
        faqsPostDedup.add(entry.post);
      }

      // Fast output for every file.
      const outputData = renderOutputData(faqs, options.type, options.repo);
      fs.writeFileSync(options.output, outputData);
    }
  };

  if (baseFolders.length > 0) {
    for (const folder of baseFolders) {
      await parsePath(path.join(basePath, folder), folder, folder);
    }
  } else {
    await parsePath(basePath, '');
  }

  console.log("File generated", options.output);
}
// Public API of this module: only the `parse` entry point is exposed.
module.exports = { parse };
exports = module.exports;