|
@@ -23,6 +23,8 @@ import { Pagination, paginate } from 'nestjs-typeorm-paginate'
|
|
|
import { KnowledgeFile } from './entities/knowledge-file.entity'
|
|
import { KnowledgeFile } from './entities/knowledge-file.entity'
|
|
|
import { FileService } from 'src/file/file.service'
|
|
import { FileService } from 'src/file/file.service'
|
|
|
import { FileStatus } from './enums/file-status.enum'
|
|
import { FileStatus } from './enums/file-status.enum'
|
|
|
|
|
+import xlsx from 'node-xlsx'
|
|
|
|
|
+import * as mime from 'mime'
|
|
|
|
|
|
|
|
function formatEmbedding(embedding: number[]) {
|
|
function formatEmbedding(embedding: number[]) {
|
|
|
return `[${embedding.join(', ')}]`
|
|
return `[${embedding.join(', ')}]`
|
|
@@ -70,7 +72,7 @@ export class KnowledgeBaseService {
|
|
|
type: DataTypes.INTEGER
|
|
type: DataTypes.INTEGER
|
|
|
},
|
|
},
|
|
|
orgId: {
|
|
orgId: {
|
|
|
- type: DataTypes.INTEGER
|
|
|
|
|
|
|
+ type: DataTypes.INTEGER
|
|
|
},
|
|
},
|
|
|
knowledgeId: {
|
|
knowledgeId: {
|
|
|
type: DataTypes.INTEGER
|
|
type: DataTypes.INTEGER
|
|
@@ -155,7 +157,7 @@ export class KnowledgeBaseService {
|
|
|
const { url: fileUrl } = await this.fileService.uploadBuffer(
|
|
const { url: fileUrl } = await this.fileService.uploadBuffer(
|
|
|
buffer,
|
|
buffer,
|
|
|
mimetype.split('/')[1],
|
|
mimetype.split('/')[1],
|
|
|
- originalname.split('.').slice(-1)
|
|
|
|
|
|
|
+ mime.getExtension(mimetype)
|
|
|
)
|
|
)
|
|
|
knowledgeFile = new KnowledgeFile()
|
|
knowledgeFile = new KnowledgeFile()
|
|
|
knowledgeFile.orgId = knowledgeBase.orgId
|
|
knowledgeFile.orgId = knowledgeBase.orgId
|
|
@@ -165,12 +167,18 @@ export class KnowledgeBaseService {
|
|
|
knowledgeFile.fileName = fileName
|
|
knowledgeFile.fileName = fileName
|
|
|
knowledgeFile.size = size
|
|
knowledgeFile.size = size
|
|
|
knowledgeFile.fileUrl = fileUrl
|
|
knowledgeFile.fileUrl = fileUrl
|
|
|
- await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
|
|
- this.processKnowledgeFile(knowledgeFile, buffer)
|
|
|
|
|
|
|
+ switch (mimetype) {
|
|
|
|
|
+ case 'application/pdf':
|
|
|
|
|
+ await this.processPdfKnowledgeFile(knowledgeFile, buffer)
|
|
|
|
|
+ break
|
|
|
|
|
+ case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
|
|
|
|
|
+ await this.processExcelKnowledgeFile(knowledgeFile, buffer)
|
|
|
|
|
+ break
|
|
|
|
|
+ }
|
|
|
return knowledgeFile
|
|
return knowledgeFile
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- public async processKnowledgeFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
|
|
|
|
+ public async processPdfKnowledgeFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
knowledgeFile.status = FileStatus.PROCESSING
|
|
knowledgeFile.status = FileStatus.PROCESSING
|
|
|
try {
|
|
try {
|
|
|
await this.knowledgeFileRepository.save(knowledgeFile)
|
|
await this.knowledgeFileRepository.save(knowledgeFile)
|
|
@@ -192,7 +200,11 @@ export class KnowledgeBaseService {
|
|
|
contents.push(paragraph)
|
|
contents.push(paragraph)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- const embeddings = await this.createEmbeddings(contents)
|
|
|
|
|
|
|
+ const embeddings = await this.createEmbeddings(
|
|
|
|
|
+ contents.map((i) => {
|
|
|
|
|
+ return { text: i }
|
|
|
|
|
+ })
|
|
|
|
|
+ )
|
|
|
Logger.log(
|
|
Logger.log(
|
|
|
`create embeddings finished, total token usage: ${embeddings.reduce((acc, cur) => acc + cur.token, 0)}`
|
|
`create embeddings finished, total token usage: ${embeddings.reduce((acc, cur) => acc + cur.token, 0)}`
|
|
|
)
|
|
)
|
|
@@ -226,6 +238,57 @@ export class KnowledgeBaseService {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Parses an Excel workbook and stores one embedding per usable row.
 *
 * For every row of every sheet, the first cell is used as the text to embed
 * and the second cell as its detail; rows where either cell is missing or
 * empty are skipped. The stored `text` is `<first cell>\n<second cell>`.
 *
 * The file status is saved as PROCESSING first, then DONE on success, or
 * FAILED (with `error` set to the failure message) if anything throws.
 *
 * @param knowledgeFile File record whose ids and hash are copied onto each embedding row.
 * @param buffer Raw workbook bytes handed to `xlsx.parse`.
 */
public async processExcelKnowledgeFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
    knowledgeFile.status = FileStatus.PROCESSING
    try {
        await this.knowledgeFileRepository.save(knowledgeFile)

        // Blank rows and missing cells yield undefined, and numeric cells have no
        // `.length`; normalise every cell to a string before testing for emptiness.
        const cellToText = (cell: unknown): string => (cell == null ? '' : String(cell))

        // Gather rows from ALL sheets first so the workbook is embedded and stored as
        // one unit instead of each sheet replacing the rows of the previous one.
        const contents: { text: string; detail: string }[] = []
        for (const sheet of xlsx.parse(buffer)) {
            for (const row of sheet.data) {
                const text = cellToText(row[0])
                const detail = cellToText(row[1])
                if (text.length && detail.length) {
                    contents.push({ text, detail })
                }
            }
        }

        const embeddings = await this.createEmbeddings(contents)
        Logger.log(
            `create embeddings finished, total token usage: ${embeddings.reduce(
                (acc, cur) => acc + cur.token,
                0
            )}`
        )

        // Remove rows from any earlier run of the same file exactly once, and only
        // after the new embeddings were computed successfully.
        await KnowledgeEmbedding.destroy({
            where: {
                fileHash: knowledgeFile.fileHash
            }
        })

        // A single counter keeps `index` unique across the whole file.
        let index = 0
        for (const item of embeddings) {
            try {
                await KnowledgeEmbedding.create({
                    orgId: knowledgeFile.orgId,
                    knowledgeId: knowledgeFile.knowledgeId,
                    fileId: knowledgeFile.id,
                    fileHash: knowledgeFile.fileHash,
                    text: item.text + '\n' + item.detail,
                    embedding: formatEmbedding(item.embedding),
                    index: index++
                })
            } catch (error) {
                // Best effort: a single row failing to persist must not abort the file.
                Logger.error(error instanceof Error ? error.message : String(error))
            }
        }

        // Mark the file as done after every sheet has been handled, including the
        // case of a workbook without any sheet or usable row.
        knowledgeFile.status = FileStatus.DONE
        await this.knowledgeFileRepository.save(knowledgeFile)
    } catch (e) {
        knowledgeFile.status = FileStatus.FAILED
        knowledgeFile.error = e instanceof Error ? e.message : String(e)
        await this.knowledgeFileRepository.save(knowledgeFile)
    }
}
|
|
|
|
|
+
|
|
|
/**
 * Tells whether `str` ends with one or more characters from a fixed set of
 * sentence-ending or closing punctuation marks (ASCII and full-width forms).
 *
 * @param str Text whose tail is inspected.
 * @returns `true` when the end of `str` matches the punctuation set.
 */
isFullSentence(str) {
    const trailingPunctuation = /[.!?。!?…;;::”’)】》」』〕〉》〗〞〟»"'\])}]+$/
    return trailingPunctuation.test(str)
}
|
|
@@ -236,22 +299,25 @@ export class KnowledgeBaseService {
|
|
|
return hash.digest('hex')
|
|
return hash.digest('hex')
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
 * Computes an embedding for every entry of `content` through a work queue
 * created with a concurrency argument of 32.
 *
 * Each result is the input entry merged with its position (`index`) and with
 * whatever `getEmbedding(entry.text)` resolves to. Results are placed by
 * position, so their order follows `content`; entries that end up without a
 * truthy `embedding` are left out of the returned array.
 *
 * @param content Entries to embed; extra properties are carried through to the result.
 * @returns The merged entries that received an embedding.
 */
async createEmbeddings(content: { text: string }[]) {
    const slots = Array(content.length)

    // Arrow function keeps `this` bound to the service, so no alias is needed.
    const embedOne = async (job) => {
        const embedded = await this.getEmbedding(job.text)
        slots[job.index] = { ...job, ...embedded }
        Logger.log(`create embedding for ${job.index + 1}/${content.length}`)
    }

    const pool = queue.promise(embedOne, 32)
    content.forEach((entry, index) => {
        pool.push({ ...entry, index })
    })
    await pool.drained()

    return slots.filter((slot) => slot && slot.embedding)
}
|
|
|
|
|
|
|
|
async getEmbedding(content: string, retry = 0) {
|
|
async getEmbedding(content: string, retry = 0) {
|