|
|
@@ -14,6 +14,7 @@ import { Configuration, OpenAIApi } from 'azure-openai'
|
|
|
import { DataTypes, Sequelize } from 'sequelize'
|
|
|
import { ConfigService } from '@nestjs/config'
|
|
|
import { KnowledgeEmbedding } from './entities/knowledge-embedding.entity'
|
|
|
+import { Documents } from './entities/documents.entity'
|
|
|
import { VECTOR } from '../utils/pgvector'
|
|
|
import * as queue from 'fastq'
|
|
|
import { setTimeout } from 'timers/promises'
|
|
|
@@ -27,10 +28,18 @@ import { FileStatus } from './enums/file-status.enum'
|
|
|
import xlsx from 'node-xlsx'
|
|
|
import * as mime from 'mime'
|
|
|
import { OpenAIEmbeddings } from 'langchain/embeddings/openai'
|
|
|
+import { ChatOpenAI } from 'langchain/chat_models/openai'
|
|
|
+import { HumanMessage } from 'langchain/schema'
|
|
|
import { TypeORMVectorStore } from 'langchain/vectorstores/typeorm'
|
|
|
import { UnstructuredLoader } from 'langchain/document_loaders/fs/unstructured'
|
|
|
import { mkdtempSync, unlinkSync, rmdirSync, writeFileSync, rmSync } from 'fs'
|
|
|
import { join } from 'path'
|
|
|
+import { OpenAIParallelEmbeddings } from './embedding'
|
|
|
+import { ConversationalRetrievalQAChain } from 'langchain/chains'
|
|
|
+import { HNSWLib } from 'langchain/vectorstores/hnswlib'
|
|
|
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
|
|
|
+import { BufferMemory } from 'langchain/memory'
|
|
|
+import { PromptTemplate } from 'langchain/prompts'
|
|
|
function formatEmbedding(embedding: number[]) {
|
|
|
return `[${embedding.join(', ')}]`
|
|
|
}
|
|
|
@@ -38,10 +47,9 @@ function formatEmbedding(embedding: number[]) {
|
|
|
@Injectable()
|
|
|
export class KnowledgeBaseService implements OnModuleInit {
|
|
|
private readonly tokenizer: Tiktoken
|
|
|
- private readonly openai: OpenAIApi
|
|
|
- private readonly embeddingApi: OpenAIApi
|
|
|
+ private embeddingModel: OpenAIParallelEmbeddings
|
|
|
+ private chatModel: ChatOpenAI
|
|
|
private readonly sequelize: Sequelize
|
|
|
- private embeddings: OpenAIEmbeddings
|
|
|
private vectorStore: TypeORMVectorStore
|
|
|
constructor(
|
|
|
@InjectRepository(KnowledgeBase)
|
|
|
@@ -52,28 +60,6 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
private readonly fileService: FileService
|
|
|
) {
|
|
|
this.tokenizer = get_encoding('cl100k_base')
|
|
|
- this.openai = new OpenAIApi(
|
|
|
- new Configuration({
|
|
|
- apiKey: process.env.AZURE_OPENAI_KEY,
|
|
|
- // add azure info into configuration
|
|
|
- azure: {
|
|
|
- apiKey: process.env.AZURE_OPENAI_KEY,
|
|
|
- endpoint: process.env.AZURE_OPENAI_ENDPOINT,
|
|
|
- deploymentName: process.env.AZURE_OPENAI_DEPLOYMENT
|
|
|
- }
|
|
|
- })
|
|
|
- )
|
|
|
- this.embeddingApi = new OpenAIApi(
|
|
|
- new Configuration({
|
|
|
- apiKey: process.env.AZURE_EMBEDDING_KEY,
|
|
|
- // add azure info into configuration
|
|
|
- azure: {
|
|
|
- apiKey: process.env.AZURE_EMBEDDING_KEY,
|
|
|
- endpoint: `https://${process.env.AZURE_EMBEDDING_INSTANCE}.openai.azure.com`,
|
|
|
- deploymentName: process.env.AZURE_EMBEDDING_DEPLOYMENT
|
|
|
- }
|
|
|
- })
|
|
|
- )
|
|
|
this.sequelize = new Sequelize({
|
|
|
dialect: 'postgres',
|
|
|
host: configService.get<string>('PG_HOST'),
|
|
|
@@ -84,6 +70,9 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
// logging: (msg) => Logger.debug(msg, 'Sequelize')
|
|
|
logging: false
|
|
|
})
|
|
|
+ }
|
|
|
+
|
|
|
+ async onModuleInit() {
|
|
|
KnowledgeEmbedding.init(
|
|
|
{
|
|
|
id: {
|
|
|
@@ -117,19 +106,46 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
},
|
|
|
{ sequelize: this.sequelize }
|
|
|
)
|
|
|
+ Documents.init(
|
|
|
+ {
|
|
|
+ id: {
|
|
|
+ primaryKey: true,
|
|
|
+        type: DataTypes.UUID,
|
|
|
+        defaultValue: Sequelize.literal('uuid_generate_v4()')
|
|
|
+ },
|
|
|
+ pageContent: {
|
|
|
+ type: DataTypes.TEXT
|
|
|
+ },
|
|
|
+ metadata: {
|
|
|
+ type: DataTypes.JSONB
|
|
|
+ },
|
|
|
+ embedding: {
|
|
|
+ type: new VECTOR(1536)
|
|
|
+ }
|
|
|
+ },
|
|
|
+ { sequelize: this.sequelize, tableName: 'documents' }
|
|
|
+ )
|
|
|
this.sequelize.sync()
|
|
|
- }
|
|
|
|
|
|
- async onModuleInit() {
|
|
|
- this.embeddings = new OpenAIEmbeddings({
|
|
|
+ this.embeddingModel = new OpenAIParallelEmbeddings({
|
|
|
azureOpenAIApiKey: process.env.AZURE_EMBEDDING_KEY,
|
|
|
azureOpenAIApiInstanceName: process.env.AZURE_EMBEDDING_INSTANCE,
|
|
|
azureOpenAIApiDeploymentName: process.env.AZURE_EMBEDDING_DEPLOYMENT,
|
|
|
+ azureOpenAIApiVersion: process.env.AZURE_EMBEDDING_VERSION,
|
|
|
+ azureOpenAIApiEmbeddingsDeploymentName: process.env.AZURE_EMBEDDING_DEPLOYMENT,
|
|
|
+ maxRetries: 5,
|
|
|
+ timeout: 15000
|
|
|
+ })
|
|
|
+ this.chatModel = new ChatOpenAI({
|
|
|
+ azureOpenAIApiKey: process.env.AZURE_OPENAI_KEY,
|
|
|
+ azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_INSTANCE,
|
|
|
+ azureOpenAIApiDeploymentName: process.env.AZURE_OPENAI_DEPLOYMENT,
|
|
|
azureOpenAIApiVersion: process.env.AZURE_OPENAI_VERSION,
|
|
|
- verbose: true,
|
|
|
- maxConcurrency: 8
|
|
|
+ maxRetries: 5,
|
|
|
+ timeout: 15000,
|
|
|
+ temperature: 0.1
|
|
|
})
|
|
|
- this.vectorStore = await TypeORMVectorStore.fromDataSource(this.embeddings, {
|
|
|
+ this.vectorStore = await TypeORMVectorStore.fromDataSource(this.embeddingModel, {
|
|
|
postgresConnectionOptions: {
|
|
|
type: 'postgres',
|
|
|
host: process.env.PG_HOST,
|
|
|
@@ -163,6 +179,13 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
knowledgeId: knowledgeBaseId
|
|
|
}
|
|
|
})
|
|
|
+ await Documents.destroy({
|
|
|
+ where: {
|
|
|
+ metadata: {
|
|
|
+ knowledgeId: knowledgeBaseId
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
}
|
|
|
|
|
|
async getKnowledgeBaseById(knowledgeBaseId: number): Promise<KnowledgeBase> {
|
|
|
@@ -184,6 +207,13 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
fileId: knowledgeFileId
|
|
|
}
|
|
|
})
|
|
|
+ await Documents.destroy({
|
|
|
+ where: {
|
|
|
+ metadata: {
|
|
|
+ fileId: knowledgeFileId
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
}
|
|
|
|
|
|
public async uploadKnowledgeFile(file: Express.Multer.File, knowledgeId: number) {
|
|
|
@@ -216,76 +246,77 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
this.processExcelKnowledgeFile(knowledgeFile, buffer)
|
|
|
break
|
|
|
case 'application/pdf':
|
|
|
- this.processPdfKnowledgeFile(knowledgeFile, buffer)
|
|
|
- // this.processPdfKnowledgeFile1(knowledgeFile, buffer)
|
|
|
+ // this.processPdfKnowledgeFile(knowledgeFile, buffer)
|
|
|
+ this.processFile(knowledgeFile, buffer)
|
|
|
break
|
|
|
}
|
|
|
return knowledgeFile
|
|
|
}
|
|
|
|
|
|
- public async processPdfKnowledgeFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
- knowledgeFile.status = FileStatus.PROCESSING
|
|
|
- try {
|
|
|
- await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
- const pdf = await PdfParse(buffer)
|
|
|
- const contents = []
|
|
|
- let paragraph = ''
|
|
|
- pdf.text
|
|
|
- .trim()
|
|
|
- .split('\n')
|
|
|
- .forEach((line) => {
|
|
|
- line = line.trim()
|
|
|
- paragraph += line
|
|
|
- if (this.isFullSentence(line)) {
|
|
|
- contents.push(paragraph)
|
|
|
- paragraph = ''
|
|
|
- }
|
|
|
- })
|
|
|
- if (paragraph) {
|
|
|
- contents.push(paragraph)
|
|
|
- }
|
|
|
+ // public async processPdfKnowledgeFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
+ // knowledgeFile.status = FileStatus.PROCESSING
|
|
|
+ // try {
|
|
|
+ // await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
+ // const pdf = await PdfParse(buffer)
|
|
|
+ // const contents = []
|
|
|
+ // let paragraph = ''
|
|
|
+ // pdf.text
|
|
|
+ // .trim()
|
|
|
+ // .split('\n')
|
|
|
+ // .forEach((line) => {
|
|
|
+ // line = line.trim()
|
|
|
+ // paragraph += line
|
|
|
+ // if (this.isFullSentence(line)) {
|
|
|
+ // contents.push(paragraph)
|
|
|
+ // paragraph = ''
|
|
|
+ // }
|
|
|
+ // })
|
|
|
+ // if (paragraph) {
|
|
|
+ // contents.push(paragraph)
|
|
|
+ // }
|
|
|
|
|
|
- const embeddings = await this.createEmbeddings(
|
|
|
- contents.map((i) => {
|
|
|
- return { text: i }
|
|
|
- })
|
|
|
- )
|
|
|
- Logger.log(
|
|
|
- `create embeddings finished, total token usage: ${embeddings.reduce((acc, cur) => acc + cur.token, 0)}`
|
|
|
- )
|
|
|
- await KnowledgeEmbedding.destroy({
|
|
|
- where: {
|
|
|
- fileHash: knowledgeFile.fileHash
|
|
|
- }
|
|
|
- })
|
|
|
- let i = 0
|
|
|
- for (const item of embeddings) {
|
|
|
- try {
|
|
|
- await KnowledgeEmbedding.create({
|
|
|
- orgId: knowledgeFile.orgId,
|
|
|
- knowledgeId: knowledgeFile.knowledgeId,
|
|
|
- fileId: knowledgeFile.id,
|
|
|
- fileHash: knowledgeFile.fileHash,
|
|
|
- text: item.text,
|
|
|
- embedding: formatEmbedding(item.embedding),
|
|
|
- index: i++
|
|
|
- })
|
|
|
- } catch (error) {
|
|
|
- Logger.error(error.message)
|
|
|
- }
|
|
|
- }
|
|
|
- knowledgeFile.status = FileStatus.DONE
|
|
|
- await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
- } catch (e) {
|
|
|
- knowledgeFile.status = FileStatus.FAILED
|
|
|
- knowledgeFile.error = e.message
|
|
|
- await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
- }
|
|
|
- }
|
|
|
+ // const embeddings = await this.createEmbeddings(
|
|
|
+ // contents.map((i) => {
|
|
|
+ // return { text: i }
|
|
|
+ // })
|
|
|
+ // )
|
|
|
+ // Logger.log(
|
|
|
+ // `create embeddings finished, total token usage: ${embeddings.reduce((acc, cur) => acc + cur.token, 0)}`
|
|
|
+ // )
|
|
|
+ // await KnowledgeEmbedding.destroy({
|
|
|
+ // where: {
|
|
|
+ // fileHash: knowledgeFile.fileHash
|
|
|
+ // }
|
|
|
+ // })
|
|
|
+ // let i = 0
|
|
|
+ // for (const item of embeddings) {
|
|
|
+ // try {
|
|
|
+ // await KnowledgeEmbedding.create({
|
|
|
+ // orgId: knowledgeFile.orgId,
|
|
|
+ // knowledgeId: knowledgeFile.knowledgeId,
|
|
|
+ // fileId: knowledgeFile.id,
|
|
|
+ // fileHash: knowledgeFile.fileHash,
|
|
|
+ // text: item.text,
|
|
|
+ // embedding: formatEmbedding(item.embedding),
|
|
|
+ // index: i++
|
|
|
+ // })
|
|
|
+ // } catch (error) {
|
|
|
+ // Logger.error(error.message)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // knowledgeFile.status = FileStatus.DONE
|
|
|
+ // await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
+ // } catch (e) {
|
|
|
+ // knowledgeFile.status = FileStatus.FAILED
|
|
|
+ // knowledgeFile.error = e.message
|
|
|
+ // await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
|
|
|
- public async processPdfKnowledgeFile1(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
+ public async processFile(knowledgeFile: KnowledgeFile, buffer: Buffer) {
|
|
|
knowledgeFile.status = FileStatus.PROCESSING
|
|
|
try {
|
|
|
+ await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
const tmpDir = mkdtempSync('file')
|
|
|
const tmpFile = join(tmpDir, knowledgeFile.fileName)
|
|
|
writeFileSync(tmpFile, buffer)
|
|
|
@@ -294,14 +325,29 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
strategy: 'fast'
|
|
|
})
|
|
|
const docs = await loader.load()
|
|
|
- rmSync(tmpDir, { recursive: true })
|
|
|
- docs.forEach((doc) => {
|
|
|
- doc.metadata.orgId = knowledgeFile.orgId
|
|
|
- doc.metadata.knowledgeId = knowledgeFile.knowledgeId
|
|
|
- doc.metadata.fileId = knowledgeFile.id
|
|
|
- doc.metadata.fileHash = knowledgeFile.fileHash
|
|
|
+ const splitter = new RecursiveCharacterTextSplitter({
|
|
|
+ chunkSize: 300,
|
|
|
+ chunkOverlap: 100
|
|
|
})
|
|
|
- await this.vectorStore.addDocuments(docs)
|
|
|
+      const output = await splitter.createDocuments(
|
|
|
+ [docs.map((i) => i.pageContent).join('\n')],
|
|
|
+ [
|
|
|
+ {
|
|
|
+ orgId: knowledgeFile.orgId,
|
|
|
+ knowledgeId: knowledgeFile.knowledgeId,
|
|
|
+ fileId: knowledgeFile.id,
|
|
|
+ fileHash: knowledgeFile.fileHash,
|
|
|
+ fileName: knowledgeFile.fileName
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ {
|
|
|
+ chunkHeader: `DOCUMENT NAME: [${knowledgeFile.fileName}](${knowledgeFile.fileUrl})\n\n---\n\n`,
|
|
|
+ appendChunkOverlapHeader: true
|
|
|
+ }
|
|
|
+ )
|
|
|
+ Logger.log(`load ${output.length} docs`)
|
|
|
+ rmSync(tmpDir, { recursive: true })
|
|
|
+ await this.vectorStore.addDocuments(output)
|
|
|
knowledgeFile.status = FileStatus.DONE
|
|
|
await this.knowledgeFileRepository.save(knowledgeFile)
|
|
|
} catch (e) {
|
|
|
@@ -395,47 +441,28 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
return result.filter((i) => i && i.embedding)
|
|
|
}
|
|
|
|
|
|
- async getEmbedding(content: string, retry = 0) {
|
|
|
+ async getEmbedding(content: string) {
|
|
|
try {
|
|
|
- const response = await this.embeddingApi.createEmbedding({
|
|
|
- model: 'embedding',
|
|
|
- input: content
|
|
|
- })
|
|
|
+ const response = await this.embeddingModel.embedQuery(content)
|
|
|
return {
|
|
|
text: content,
|
|
|
- embedding: response.data.data[0].embedding,
|
|
|
- token: response.data.usage.total_tokens
|
|
|
+ embedding: response,
|
|
|
+ token: this.tokenizer.encode(content).length
|
|
|
}
|
|
|
} catch (error) {
|
|
|
- if (retry < 3) {
|
|
|
- Logger.error(`fetchEmbedding error: ${error.message}, retry ${retry}`, 'fetchEmbedding')
|
|
|
- await setTimeout(2000)
|
|
|
- return await this.getEmbedding(content, retry + 1)
|
|
|
- }
|
|
|
Logger.error(error.stack, 'fetchEmbedding')
|
|
|
throw new InternalServerErrorException(error.message)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
async getKeywords(text: string) {
|
|
|
- try {
|
|
|
- const res = await this.openai.createChatCompletion({
|
|
|
- model: 'gpt-35-turbo',
|
|
|
- messages: [
|
|
|
- {
|
|
|
- role: 'user',
|
|
|
- content: `You need to extract keywords from the statement or question and return a series of keywords separated by commas.\ncontent: ${text}\nkeywords: `
|
|
|
- }
|
|
|
- ]
|
|
|
- })
|
|
|
- return res.data.choices[0].message.content
|
|
|
- } catch (error) {
|
|
|
- Logger.error(error.message)
|
|
|
- if (error.response) {
|
|
|
- Logger.error(error.response.data)
|
|
|
- }
|
|
|
- throw new InternalServerErrorException(error.message)
|
|
|
- }
|
|
|
+ return (
|
|
|
+ await this.chatModel.call([
|
|
|
+ new HumanMessage(
|
|
|
+ `You need to extract keywords from the statement or question and return a series of keywords separated by commas.\ncontent: ${text}\nkeywords: `
|
|
|
+ )
|
|
|
+ ])
|
|
|
+ ).content
|
|
|
}
|
|
|
|
|
|
cutContext(context: string[]) {
|
|
|
@@ -474,4 +501,40 @@ export class KnowledgeBaseService implements OnModuleInit {
|
|
|
)
|
|
|
return context
|
|
|
}
|
|
|
+
|
|
|
+ async askKnowledge1(options: { question: string; orgId: number; knowledgeId?: number; fileId?: number }) {
|
|
|
+ /* Create the chain */
|
|
|
+ const chain = ConversationalRetrievalQAChain.fromLLM(
|
|
|
+ this.chatModel,
|
|
|
+ this.vectorStore.asRetriever(100, {
|
|
|
+ orgId: options.orgId
|
|
|
+ }),
|
|
|
+ {
|
|
|
+ memory: new BufferMemory({
|
|
|
+ memoryKey: 'chat_history' // Must be set to "chat_history"
|
|
|
+ }),
|
|
|
+ qaChainOptions: {
|
|
|
+ prompt: PromptTemplate.fromTemplate(`(以下内容是相关的本地知识库信息,按照相关程度从高到低排序,第一行是文件名和链接)
|
|
|
+\`\`\`
|
|
|
+{context}
|
|
|
+\`\`\`
|
|
|
+请仔细思考,优先使用以上提供的内容进行回答,并告诉我相关的文件和链接。如果无法从内容中找到答案,你需要根据给你自己掌握的知识回答问题。
|
|
|
+请确保使用中文进行回答。
|
|
|
+请在末尾附上相关的文件链接。`),
|
|
|
+ questionPrompt: PromptTemplate.fromTemplate(`{question}`)
|
|
|
+ },
|
|
|
+ questionGeneratorChainOptions: {
|
|
|
+ template: `Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
|
|
|
+
|
|
|
+Chat History:
|
|
|
+{chat_history}
|
|
|
+Follow Up Input: {question}
|
|
|
+Standalone question:`
|
|
|
+ }
|
|
|
+ }
|
|
|
+ )
|
|
|
+ /* Ask it a question */
|
|
|
+ const res = await chain.call({ question: options.question })
|
|
|
+    return res
|
|
|
+ }
|
|
|
}
|