|
|
@@ -129,6 +129,59 @@ export class ChatPdfService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/**
 * Parses an uploaded PDF, splits its text into paragraph-like chunks, creates
 * an embedding for every chunk and stores each one as a `ChatEmbedding` row.
 *
 * Rows are keyed by the hash that `calculateMD5` returns for the file content.
 * If at least one row with that key already exists, the file is treated as
 * already processed and the method returns without parsing it again.
 *
 * Storing is best-effort: a chunk that fails to persist is logged and skipped,
 * and the remaining chunks are still written.
 *
 * @param file Uploaded file; only `file.buffer` is read.
 * @returns An object whose `name` is the content hash the rows are stored under.
 */
public async firstUpload(file: Express.Multer.File) {
  const { buffer } = file
  const md5 = this.calculateMD5(buffer)

  // NOTE(review): this loads every matching row only to test for existence;
  // if ChatEmbedding supports a row limit or an existence query, prefer that.
  const existing = await ChatEmbedding.findAll({
    where: {
      name: md5
    }
  })
  if (existing.length) {
    return {
      name: md5
    }
  }

  const pdf = await PdfParse(buffer)
  const contents = this.splitPdfTextIntoParagraphs(pdf.text)

  const embeddings = await this.createEmbeddings(contents)
  Logger.log(
    `create embeddings finished, total token usage: ${embeddings.reduce((acc, cur) => acc + cur.token, 0)}`
  )

  let nextNum = 0
  for (const item of embeddings) {
    // Take the position before the insert so `num` always equals the chunk's
    // index in `embeddings`, even when an earlier insert failed.
    const num = nextNum++
    try {
      await ChatEmbedding.create({
        name: md5,
        text: item.text,
        num,
        embedding: formatEmbedding(item.embedding)
      })
    } catch (error) {
      // Anything can be thrown, so narrow before reading `.message`.
      const reason = error instanceof Error ? error.message : String(error)
      Logger.error(`failed to store embedding ${num} of ${md5}: ${reason}`)
    }
  }

  return {
    name: md5
  }
}

/**
 * Re-joins the hard-wrapped lines of extracted PDF text into paragraph-like chunks.
 *
 * A chunk is closed whenever a line ends with sentence-ending punctuation
 * (as decided by `isFullSentence`); trailing text without such an ending
 * becomes the final chunk.
 */
private splitPdfTextIntoParagraphs(text: string): string[] {
  // ASCII printable through Latin Extended-B: scripts that separate words with spaces.
  const endsLatin = /[\u0021-\u024F]$/
  const startsLatin = /^[\u0021-\u024F]/

  const paragraphs: string[] = []
  let paragraph = ''
  for (const rawLine of text.trim().split('\n')) {
    const line = rawLine.trim()
    // A line break between two Latin-script fragments stood for a word gap, so
    // put a space back; CJK text needs none. A trailing hyphen is left glued to
    // the next line, since it most likely marks a word split across lines.
    if (
      paragraph &&
      line &&
      !paragraph.endsWith('-') &&
      endsLatin.test(paragraph) &&
      startsLatin.test(line)
    ) {
      paragraph += ' '
    }
    paragraph += line
    if (this.isFullSentence(line)) {
      paragraphs.push(paragraph)
      paragraph = ''
    }
  }
  if (paragraph) {
    paragraphs.push(paragraph)
  }
  return paragraphs
}
|
|
|
+
|
|
|
/**
 * Reports whether `str` ends with sentence-ending or closing punctuation.
 *
 * Only the last character is inspected. The accepted set covers ASCII, CJK
 * and full-width terminators; the full-width forms (！ ？ ； ： ）) are written
 * as `\uFF..` escapes so an encoding normalisation of this file cannot fold
 * them into their ASCII look-alikes.
 *
 * @param str Text to inspect; callers are expected to pass it already trimmed.
 * @returns `true` when the final character is one of the listed terminators.
 */
isFullSentence(str: string): boolean {
  return /[.!?。…;:”’)】》」』〕〉〗〞〟»"'\]}\uFF01\uFF1F\uFF1B\uFF1A\uFF09]$/.test(str)
}
|