roo00 6 лет назад
Родитель
Commit
3bd3cc6a26

Разница между файлами не показана из-за своего большого размера
+ 0 - 0
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Person.java


+ 2 - 3
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Query.java

@@ -1,12 +1,11 @@
 package com.x.base.core.project.config;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang3.BooleanUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.utils.SystemUtils;
 import org.quartz.CronExpression;
 
 import com.x.base.core.project.annotation.FieldDescribe;
@@ -82,7 +81,7 @@ public class Query extends ConfigObject {
 	}
 
 	public Boolean getExtractImage() {
-		return BooleanUtils.isTrue(extractImage);
+		return SystemUtils.IS_OS_WINDOWS && BooleanUtils.isTrue(extractImage); // true only on Windows AND when the configured flag is TRUE
 	}
 
 	public String getTessLanguage() {

+ 179 - 0
o2server/x_base_core_project/src/main/java/com/x/base/core/project/tools/ExtractTextTools.java

@@ -0,0 +1,179 @@
+package com.x.base.core.project.tools;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.util.List;
+
+import javax.imageio.ImageIO;
+
+import org.apache.commons.collections4.list.UnmodifiableList;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.io.RandomAccessBuffer;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.Tika;
+import org.junit.Test;
+
+import com.x.base.core.project.config.Config;
+import com.x.base.core.project.logger.Logger;
+import com.x.base.core.project.logger.LoggerFactory;
+import com.x.base.core.project.tools.DefaultCharset;
+import com.x.base.core.project.tools.ListTools;
+
+import net.sourceforge.tess4j.Tesseract;
+
+public class ExtractTextTools {
+
+	/** Class logger; the extractors use it to report failures. */
+	private static final Logger logger = LoggerFactory.getLogger(ExtractTextTools.class);
+
+	/**
+	 * Shared OCR engine, created lazily on first use. Declared volatile so the
+	 * lazily assigned instance is safely visible to every thread.
+	 */
+	private static volatile Tesseract tesseract = null;
+
+	/**
+	 * Shared Tika facade, created lazily on first use. Declared volatile so the
+	 * lazily assigned instance is safely visible to every thread.
+	 */
+	private static volatile Tika tika = null;
+
+	/** Largest content size, in bytes (32 MiB), that available() accepts. */
+	public static final Integer MAXLENGTH = 1024 * 1024 * 32;
+
+	/**
+	 * Tells whether text extraction is supported for a file, judging only by
+	 * the extension of its name.
+	 *
+	 * @param name file name
+	 * @return true when the lower-cased extension, prefixed with a dot, is
+	 *         contained in SUPPORT_TYPES; false when the name has no extension
+	 */
+	public static boolean support(String name) {
+		String ext = StringUtils.substringAfterLast(name, ".");
+		if (StringUtils.isEmpty(ext)) {
+			return false;
+		}
+		// Locale.ROOT keeps the lower-casing independent of the JVM default
+		// locale (e.g. Turkish, where "I" does not lower-case to "i").
+		return SUPPORT_TYPES.contains("." + StringUtils.lowerCase(ext, java.util.Locale.ROOT));
+	}
+
+	/**
+	 * Tells whether a file is treated as an image, judging only by the
+	 * extension of its name.
+	 *
+	 * @param name file name
+	 * @return true when the lower-cased extension, prefixed with a dot, is
+	 *         contained in SUPPORT_IMAGE_TYPES; false when the name has no
+	 *         extension
+	 */
+	public static boolean supportImage(String name) {
+		String ext = StringUtils.substringAfterLast(name, ".");
+		if (StringUtils.isEmpty(ext)) {
+			return false;
+		}
+		// Locale.ROOT keeps the lower-casing independent of the JVM default
+		// locale (e.g. Turkish, where "I" does not lower-case to "i").
+		return SUPPORT_IMAGE_TYPES.contains("." + StringUtils.lowerCase(ext, java.util.Locale.ROOT));
+	}
+
+	/**
+	 * Checks that the content is present and not larger than MAXLENGTH.
+	 *
+	 * @param bytes raw file content
+	 * @return false for null, empty or over-sized content; true otherwise
+	 */
+	public static boolean available(byte[] bytes) {
+		return null != bytes && bytes.length != 0 && bytes.length <= MAXLENGTH;
+	}
+
+	/**
+	 * Lower-case file extensions, each including its leading dot, for which
+	 * support() answers true.
+	 */
+	public static final List<String> SUPPORT_TYPES = UnmodifiableList.unmodifiableList(ListTools.toList(".doc", ".docx",
+			".pdf", ".xls", ".xlsx", ".txt", ".bmp", ".jpg", ".png", ".gif", ".jpeg", ".jpe"));
+
+	/**
+	 * Lower-case image file extensions, each including its leading dot, for
+	 * which supportImage() answers true.
+	 */
+	public static final List<String> SUPPORT_IMAGE_TYPES = UnmodifiableList
+			.unmodifiableList(ListTools.toList(".bmp", ".jpg", ".png", ".gif", ".jpeg", ".jpe"));
+
+	/**
+	 * Extracts text from file content, choosing the extractor from the file
+	 * name's extension and the per-category switches.
+	 *
+	 * @param bytes  raw file content
+	 * @param name   file name; only its extension is examined, ignoring case
+	 * @param office enables .doc/.docx/.xls/.xlsx; null counts as disabled
+	 * @param pdf    enables .pdf; null counts as disabled
+	 * @param txt    enables .txt; null counts as disabled
+	 * @param image  enables .jpg/.png/.gif/.bmp/.jpeg/.jpe; null counts as
+	 *               disabled
+	 * @return the extracted text, or null when the content is null, empty or
+	 *         too large, when no enabled category matches the extension, or
+	 *         when the selected extractor itself returns null
+	 */
+	public static String extract(byte[] bytes, String name, Boolean office, Boolean pdf, Boolean txt, Boolean image) {
+		// NOTE(review): this 10 MiB cap is stricter than MAXLENGTH (32 MiB)
+		// accepted by available() - confirm which limit is intended.
+		if (null == bytes || bytes.length == 0 || bytes.length >= 1024 * 1024 * 10) {
+			return null;
+		}
+		// Boolean.TRUE.equals(...) avoids a NullPointerException from
+		// auto-unboxing when a caller passes a null switch.
+		if (Boolean.TRUE.equals(office)) {
+			if (StringUtils.endsWithIgnoreCase(name, ".doc") || StringUtils.endsWithIgnoreCase(name, ".docx")) {
+				return word(bytes);
+			}
+			if (StringUtils.endsWithIgnoreCase(name, ".xls") || StringUtils.endsWithIgnoreCase(name, ".xlsx")) {
+				return excel(bytes);
+			}
+		}
+		if (Boolean.TRUE.equals(pdf) && StringUtils.endsWithIgnoreCase(name, ".pdf")) {
+			return pdf(bytes);
+		}
+		if (Boolean.TRUE.equals(txt) && StringUtils.endsWithIgnoreCase(name, ".txt")) {
+			return text(bytes);
+		}
+		if (Boolean.TRUE.equals(image)) {
+			if (StringUtils.endsWithIgnoreCase(name, ".jpg") || StringUtils.endsWithIgnoreCase(name, ".png")
+					|| StringUtils.endsWithIgnoreCase(name, ".gif") || StringUtils.endsWithIgnoreCase(name, ".bmp")
+					|| StringUtils.endsWithIgnoreCase(name, ".jpeg")
+					|| StringUtils.endsWithIgnoreCase(name, ".jpe")) {
+				return image(bytes);
+			}
+		}
+		return null;
+	}
+
+	public static String pdf(byte[] bytes) {
+		try {
+			PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
+			parser.parse();
+			try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
+				PDFTextStripper stripper = new PDFTextStripper();
+				stripper.setStartPage(1);
+				stripper.setEndPage(pd.getNumberOfPages());
+				return stripper.getText(pd);
+			}
+		} catch (Exception e) {
+			logger.error(e);
+		}
+		return null;
+	}
+
+	public static String word(byte[] bytes) {
+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
+			return tikaInstance().parseToString(in);
+		} catch (Exception e) {
+			logger.error(e);
+		}
+		return null;
+	}
+
+	public static String excel(byte[] bytes) {
+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
+			return tikaInstance().parseToString(in);
+		} catch (Exception e) {
+			logger.error(e);
+		}
+		return null;
+	}
+
+	/**
+	 * Decodes plain-text content using DefaultCharset.charset.
+	 *
+	 * @param bytes raw text content
+	 * @return the decoded string, or null when bytes is null (matching the
+	 *         other extractors, which return null rather than throwing)
+	 */
+	public static String text(byte[] bytes) {
+		if (null == bytes) {
+			return null;
+		}
+		return new String(bytes, DefaultCharset.charset);
+	}
+
+	/**
+	 * Recognizes text in an image by decoding it with ImageIO and running OCR
+	 * on the result with the shared Tesseract instance.
+	 *
+	 * @param bytes raw image content
+	 * @return the recognized text, or null when the image cannot be decoded or
+	 *         OCR fails (the failure is logged)
+	 */
+	public static String image(byte[] bytes) {
+		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
+			BufferedImage decoded = ImageIO.read(in);
+			if (null == decoded) {
+				// ImageIO.read returns null, instead of throwing, when no
+				// registered reader can decode the data; report that clearly
+				// rather than passing null on to the OCR engine.
+				logger.error(new IllegalArgumentException("image data can not be decoded by ImageIO."));
+				return null;
+			}
+			return tesseractInstance().doOCR(decoded);
+		} catch (Exception e) {
+			logger.error(e);
+		}
+		return null;
+	}
+
+	/**
+	 * Returns the shared Tesseract engine, creating and configuring it on the
+	 * first call. The method synchronizes on this class, so creation happens
+	 * at most once and every caller sees the configured instance.
+	 *
+	 * @return the configured Tesseract instance
+	 * @throws Exception when the configuration values cannot be read; nothing
+	 *                   is cached in that case, so a later call retries
+	 */
+	private static synchronized Tesseract tesseractInstance() throws Exception {
+		if (null == tesseract) {
+			Tesseract created = new Tesseract();
+			// location of the trained-data (tessdata) directory
+			created.setDatapath(Config.dir_commons_tess4j_tessdata().getAbsolutePath());
+			// recognition language, taken from the query configuration
+			created.setLanguage(Config.query().getTessLanguage());
+			// Publish only after configuration succeeded: a failure above must
+			// not leave an unconfigured engine cached in the static field.
+			tesseract = created;
+		}
+		return tesseract;
+	}
+
+	/**
+	 * Returns the shared Tika facade, creating it on the first call. The
+	 * method synchronizes on this class, so creation happens at most once and
+	 * the field is never read without holding the lock.
+	 *
+	 * @return the shared Tika instance
+	 * @throws Exception declared for signature compatibility with existing
+	 *                   callers
+	 */
+	private static synchronized Tika tikaInstance() throws Exception {
+		if (null == tika) {
+			tika = new Tika();
+		}
+		return tika;
+	}
+
+	@Test
+	public void test1() throws Exception {
+		System.out.println(word(FileUtils.readFileToByteArray(new File("d:/1.html"))));
+	}
+
+}

Некоторые файлы не были показаны из-за большого количества измененных файлов