  // pdf.mjs (2.0 KB) -- file-viewer header; the pasted line-number gutter was removed.

  1. import PdfParse from '@cyber2024/pdf-parse-fixed'
  2. import { readFileSync } from 'fs'
  3. import cld from 'cld'
  4. import { Configuration, OpenAIApi } from 'azure-openai'
  5. import pg from 'pg'
  /**
   * Extract the text of a PDF file and regroup it into paragraphs.
   *
   * The extracted text is split on newlines. Consecutive (trimmed) lines are
   * concatenated with no separator until a line ends with sentence-closing
   * punctuation according to `isFullSentence`; the accumulated text is then
   * emitted as one paragraph. Trailing text that never reaches such a line
   * becomes the final paragraph.
   *
   * @param {string} path - Path of the PDF file to read (read synchronously).
   * @returns {Promise<{contents: string[], lang: *}>} The paragraphs, plus
   *   whatever `cld.detect` resolves to for the joined paragraphs.
   *   NOTE(review): the shape of `lang` is defined by the `cld` package, and
   *   a rejection from `cld.detect` propagates to the caller.
   */
  async function pdf2text(path) {
    const pdf = await PdfParse(readFileSync(path))
    const contents = []
    let paragraph = ''

    for (const rawLine of pdf.text.trim().split('\n')) {
      const line = rawLine.trim()
      // Lines are glued together without a space; a line only closes the
      // paragraph when it ends with terminal punctuation.
      paragraph += line
      if (isFullSentence(line)) {
        contents.push(paragraph)
        paragraph = ''
      }
    }
    // Flush text that was still being accumulated when the input ended.
    if (paragraph) {
      contents.push(paragraph)
    }

    const lang = await cld.detect(contents.join('\n'))
    console.log(contents.length)
    // Previously both values were computed and then thrown away.
    return { contents, lang }
  }
  /**
   * Tell whether a line ends with sentence-closing punctuation.
   *
   * Recognised endings are ASCII and CJK terminators (. ! ? 。 … ; :) and
   * closing quotes/brackets. The fullwidth forms of ! ? ; : ) are written as
   * \uXXXX escapes so they survive tools that width-normalise the source text.
   *
   * @param {string} str - Line to test; trailing whitespace is not skipped,
   *   so callers should trim first.
   * @returns {boolean} `true` when the last character of `str` is one of the
   *   recognised closing characters, `false` otherwise (including for '').
   */
  function isFullSentence(str) {
    // \uFF01 = fullwidth !, \uFF1F = fullwidth ?, \uFF1B = fullwidth ;,
    // \uFF1A = fullwidth :, \uFF09 = fullwidth )
    return /[.!?。…;:”’)】》」』〕〉〗〞〟»"'\]}\uFF01\uFF1F\uFF1B\uFF1A\uFF09]$/.test(str)
  }
  // Script entry: convert the PDF named by the first command-line argument.
  // With no argument, fall back to the path that used to be hard-coded here.
  await pdf2text(process.argv[2] ?? '/Users/drew/Downloads/《Python 3学习笔记(上卷)》_1-50.pdf')
  // Azure OpenAI client.
  // SECURITY: the API key used to be committed in this file. It is now read
  // from the AZURE_OPENAI_API_KEY environment variable; the previously
  // committed key should be treated as leaked and rotated.
  const openai = new OpenAIApi(
    new Configuration({
      apiKey: process.env.AZURE_OPENAI_API_KEY,
      // add azure info into configuration
      azure: {
        apiKey: process.env.AZURE_OPENAI_API_KEY,
        endpoint: process.env.AZURE_OPENAI_ENDPOINT ?? 'https://zouma.openai.azure.com/'
      }
    })
  )
  // Example embedding request, currently disabled:
  // const response = await openai.createEmbedding({
  //   model: 'embedding',
  //   input: 'The food was delicious and the waiter...'
  // })
  // console.log(JSON.stringify(response.data, null, 4))
  // PostgreSQL connection.
  // SECURITY: the database password used to be committed in this file. It is
  // now read from PGPASSWORD; the previously committed password should be
  // treated as leaked and rotated. The other settings keep their former
  // values as defaults and can be overridden through the usual PG* variables.
  const client = new pg.Client({
    host: process.env.PGHOST ?? '47.97.42.229',
    port: Number(process.env.PGPORT ?? 5432),
    user: process.env.PGUSER ?? 'postgres',
    password: process.env.PGPASSWORD,
    database: process.env.PGDATABASE ?? 'gpt_test',
    connectionTimeoutMillis: 5000
  })
  await client.connect()
  try {
    // Make sure the table exists before reading from it. The statement is
    // awaited so a failure surfaces here instead of as an unhandled rejection.
    // NOTE(review): this relies on the sequence `embedding_id_seq` and on a
    // `vector` type already existing in the database (presumably from the
    // pgvector extension) -- confirm both are provisioned.
    await client.query(`create table if not exists public.chat_embedding (
      id integer primary key not null default nextval('embedding_id_seq'::regclass),
      name character varying,
      text character varying,
      embedding vector(1536)
    );`)
    const res = await client.query('SELECT * FROM chat_embedding')
    console.log(res.rows)
  } finally {
    // Always release the connection, even when a query throws.
    await client.end()
  }