7 tahun lalu · 3bd3cc6a26
--- a/o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Person.java
+++ b/o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Person.java
--- a/o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Query.java
+++ b/o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Query.java
@@ -1,12 +1,11 @@
 
				 package com.x.base.core.project.config;
			
 
				 
			
 
				 import java.io.File;
			
 
				-import java.util.ArrayList;
			
 
				-import java.util.List;
			
 
				 
			
 
				 import org.apache.commons.io.FileUtils;
			
 
				 import org.apache.commons.lang3.BooleanUtils;
			
 
				 import org.apache.commons.lang3.StringUtils;
			
 
				+import org.apache.tika.utils.SystemUtils;
			
 
				 import org.quartz.CronExpression;
			
 
				 
			
 
				 import com.x.base.core.project.annotation.FieldDescribe;
			
@@ -82,7 +81,7 @@ public class Query extends ConfigObject {
 
				 	}
			
 
				 
			
 
				 	public Boolean getExtractImage() {
			
 
				-		return BooleanUtils.isTrue(extractImage);
			
 
				+		return SystemUtils.IS_OS_WINDOWS && BooleanUtils.isTrue(extractImage);
			
 
				 	}
			
 
				 
			
 
				 	public String getTessLanguage() {
			
--- a/o2server/x_base_core_project/src/main/java/com/x/base/core/project/tools/ExtractTextTools.java
+++ b/o2server/x_base_core_project/src/main/java/com/x/base/core/project/tools/ExtractTextTools.java
@@ -0,0 +1,179 @@
 
				+package com.x.base.core.project.tools;
			
 
				+
			
 
				+import java.awt.image.BufferedImage;
			
 
				+import java.io.ByteArrayInputStream;
			
 
				+import java.io.File;
			
 
				+import java.util.List;
			
 
				+
			
 
				+import javax.imageio.ImageIO;
			
 
				+
			
 
				+import org.apache.commons.collections4.list.UnmodifiableList;
			
 
				+import org.apache.commons.io.FileUtils;
			
 
				+import org.apache.commons.lang3.StringUtils;
			
 
				+import org.apache.pdfbox.cos.COSDocument;
			
 
				+import org.apache.pdfbox.io.RandomAccessBuffer;
			
 
				+import org.apache.pdfbox.pdfparser.PDFParser;
			
 
				+import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				+import org.apache.pdfbox.text.PDFTextStripper;
			
 
				+import org.apache.tika.Tika;
			
 
				+import org.junit.Test;
			
 
				+
			
 
				+import com.x.base.core.project.config.Config;
			
 
				+import com.x.base.core.project.logger.Logger;
			
 
				+import com.x.base.core.project.logger.LoggerFactory;
			
 
				+import com.x.base.core.project.tools.DefaultCharset;
			
 
				+import com.x.base.core.project.tools.ListTools;
			
 
				+
			
 
				+import net.sourceforge.tess4j.Tesseract;
			
 
				+
			
 
				+public class ExtractTextTools {
			
 
				+
			
 
				+	private static Logger logger = LoggerFactory.getLogger(ExtractTextTools.class);
			
 
				+
			
 
				+	private static Tesseract tesseract = null;
			
 
				+
			
 
				+	private static Tika tika = null;
			
 
				+
			
 
				+	public static final Integer MAXLENGTH = 1024 * 1024 * 32;
			
 
				+
			
 
				+	public static boolean support(String name) {
			
 
				+		String ext = StringUtils.substringAfterLast(name, ".");
			
 
				+		if (StringUtils.isNotEmpty(ext)) {
			
 
				+			ext = "." + StringUtils.lowerCase(ext);
			
 
				+			return SUPPORT_TYPES.contains(ext);
			
 
				+		}
			
 
				+		return false;
			
 
				+	}
			
 
				+
			
 
				+	public static boolean supportImage(String name) {
			
 
				+		String ext = StringUtils.substringAfterLast(name, ".");
			
 
				+		if (StringUtils.isNotEmpty(ext)) {
			
 
				+			ext = "." + StringUtils.lowerCase(ext);
			
 
				+			return SUPPORT_IMAGE_TYPES.contains(ext);
			
 
				+		}
			
 
				+		return false;
			
 
				+	}
			
 
				+
			
 
				+	public static boolean available(byte[] bytes) {
			
 
				+		if (null == bytes || bytes.length == 0 || bytes.length > MAXLENGTH) {
			
 
				+			return false;
			
 
				+		}
			
 
				+		return true;
			
 
				+	}
			
 
				+
			
 
				+	public static final List<String> SUPPORT_TYPES = UnmodifiableList.unmodifiableList(ListTools.toList(".doc", ".docx",
			
 
				+			".pdf", ".xls", ".xlsx", ".txt", ".bmp", ".jpg", ".png", ".gif", ".jpeg", "jpe"));
			
 
				+
			
 
				+	public static final List<String> SUPPORT_IMAGE_TYPES = UnmodifiableList
			
 
				+			.unmodifiableList(ListTools.toList(".bmp", ".jpg", ".png", ".gif", ".jpeg", "jpe"));
			
 
				+
			
 
				+	public static String extract(byte[] bytes, String name, Boolean office, Boolean pdf, Boolean txt, Boolean image) {
			
 
				+		if ((null != bytes) && bytes.length > 0 && bytes.length < 1024 * 1024 * 10) {
			
 
				+			if (office) {
			
 
				+				if (StringUtils.endsWithIgnoreCase(name, ".doc") || StringUtils.endsWithIgnoreCase(name, ".docx")) {
			
 
				+					return word(bytes);
			
 
				+				}
			
 
				+				if (StringUtils.endsWithIgnoreCase(name, ".xls") || StringUtils.endsWithIgnoreCase(name, ".xlsx")) {
			
 
				+					return excel(bytes);
			
 
				+				}
			
 
				+			}
			
 
				+			if (pdf) {
			
 
				+				if (StringUtils.endsWithIgnoreCase(name, ".pdf")) {
			
 
				+					return pdf(bytes);
			
 
				+				}
			
 
				+			}
			
 
				+			if (txt) {
			
 
				+				if (StringUtils.endsWithIgnoreCase(name, ".txt")) {
			
 
				+					return text(bytes);
			
 
				+				}
			
 
				+			}
			
 
				+			if (image) {
			
 
				+				if (StringUtils.endsWithIgnoreCase(name, ".jpg") || StringUtils.endsWithIgnoreCase(name, ".png")
			
 
				+						|| StringUtils.endsWithIgnoreCase(name, ".gif") || StringUtils.endsWithIgnoreCase(name, ".bmp")
			
 
				+						|| StringUtils.endsWithIgnoreCase(name, ".jpeg")
			
 
				+						|| StringUtils.endsWithIgnoreCase(name, ".jpe")) {
			
 
				+					return image(bytes);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		return null;
			
 
				+	}
			
 
				+
			
 
				+	public static String pdf(byte[] bytes) {
			
 
				+		try {
			
 
				+			PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
			
 
				+			parser.parse();
			
 
				+			try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
			
 
				+				PDFTextStripper stripper = new PDFTextStripper();
			
 
				+				stripper.setStartPage(1);
			
 
				+				stripper.setEndPage(pd.getNumberOfPages());
			
 
				+				return stripper.getText(pd);
			
 
				+			}
			
 
				+		} catch (Exception e) {
			
 
				+			logger.error(e);
			
 
				+		}
			
 
				+		return null;
			
 
				+	}
			
 
				+
			
 
				+	public static String word(byte[] bytes) {
			
 
				+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
			
 
				+			return tikaInstance().parseToString(in);
			
 
				+		} catch (Exception e) {
			
 
				+			logger.error(e);
			
 
				+		}
			
 
				+		return null;
			
 
				+	}
			
 
				+
			
 
				+	public static String excel(byte[] bytes) {
			
 
				+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
			
 
				+			return tikaInstance().parseToString(in);
			
 
				+		} catch (Exception e) {
			
 
				+			logger.error(e);
			
 
				+		}
			
 
				+		return null;
			
 
				+	}
			
 
				+
			
 
				+	public static String text(byte[] bytes) {
			
 
				+		return new String(bytes, DefaultCharset.charset);
			
 
				+	}
			
 
				+
			
 
				+	public static String image(byte[] bytes) {
			
 
				+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
			
 
				+			BufferedImage image = ImageIO.read(in);
			
 
				+			return tesseractInstance().doOCR(image);
			
 
				+		} catch (Exception e) {
			
 
				+			logger.error(e);
			
 
				+		}
			
 
				+		return null;
			
 
				+	}
			
 
				+
			
 
				+	private static Tesseract tesseractInstance() throws Exception {
			
 
				+		if (null == tesseract) {
			
 
				+			synchronized (ExtractTextTools.class) {
			
 
				+				if (null == tesseract) {
			
 
				+					tesseract = new Tesseract();
			
 
				+					tesseract.setDatapath(Config.dir_commons_tess4j_tessdata().getAbsolutePath());// 设置训练库的位置
			
 
				+					tesseract.setLanguage(Config.query().getTessLanguage());// 中文识别
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		return tesseract;
			
 
				+	}
			
 
				+
			
 
				+	private static Tika tikaInstance() throws Exception {
			
 
				+		if (null == tika) {
			
 
				+			synchronized (ExtractTextTools.class) {
			
 
				+				if (null == tika) {
			
 
				+					tika = new Tika();
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		return tika;
			
 
				+	}
			
 
				+
			
 
				+	@Test
			
 
				+	public void test1() throws Exception {
			
 
				+		System.out.println(word(FileUtils.readFileToByteArray(new File("d:/1.html"))));
			
 
				+	}
			
 
				+
			
 
				+}