java读取doc文本,java读取doc文件内容

本文目录一览：

1、java读取word文件的内容
2、java读取带格式word内容
3、JAVA有什么好的方法可以将word里的文本读取出来
4、java读取doc,pdf问题。
5、如何使用JAVA，POI读写word文档
6、java如何获得doc文件内容

java读取word文件的内容

WordExtractor的用法错了，你用下面的方法试试：

......

// Wrap the document path in a File; `doc` is presumably a path String declared in the elided code above.
File file = new File(doc);

// getFileInputStream is a helper that is not shown in this snippet; presumably it opens `file` for reading - confirm.
FileInputStream fileInputStream = getFileInputStream(file);

// Hand the input stream to the WordExtractor constructor.
WordExtractor wordExtractor = new WordExtractor(fileInputStream);

// Retrieve the extracted text as one String.
String text = wordExtractor.getText();

......

java读取doc文本,java读取doc文件内容

java读取带格式word内容

用jacob吧。。

/**
 * @author eyuan
 */

package per.eyuan.word2txt.core;

import com.jacob.*;

import com.jacob.com.*;

import com.jacob.activeX.*;

import java.io.*;

import java.util.Scanner;

public class Core {

/**

* 实现转换的函数

* @param sourceFilesPath

* @param destinationFilesPath

* @param destinationFilesType

* @return void

* @see import com.jacob.activeX.*;

public static void change(String sourceFilesPath,String destinationFilesPath,int destinationFilesType){

//使用word文件所在的目录（源路径）建立目录文件

File sourcePathFile=new File(sourceFilesPath);

//取得word文件（源文件列表）

File sourceFilesList[]=sourcePathFile.listFiles();

System.out.println("共有"+sourceFilesList.length+"个文件（文件夹）");

//指定要转换的文件所在的目录下，如果有子目录，

//则进入子目录，继续查找word文档并将其转换，

//直到将指定目录下的所有word文档转换完。

//子目录名

String sourceChildPath=new String("");

//保持原来的层次关系，将子目录下的文件存放在新建的子目录中

String destiNationChildPath=new String("");

//检索文件，过滤掉非word文件，通过扩展名过滤

for(int i=0;isourceFilesList.length;i++){

//排除掉子文件夹

if(sourceFilesList[i].isFile()){

System.out.println("第"+(i+1)+"个文件：");

//取得文件全名（包含扩展名）

String fileName=sourceFilesList[i].getName();

String fileType=new String("");

//取得文件扩展名

fileType=fileName.substring((fileName.length()-4), fileName.length());

//word2007-2010扩展名为docx

//判断是否为word2007-2010文档，及是否以docx为后缀名

if(fileType.equals("docx")){

System.out.println("正在转换。。。");

//输出word文档所在路劲

System.out.println("目录："+sourceFilesPath);

//输出word文档名

System.out.println("文件名："+fileName);

//System.out.println(fileName.substring(0, (fileName.length()-5)));

//核心函数

//启动word

ActiveXComponent app=new ActiveXComponent("Word.Application");

//要转换的文档的全路径（所在文件夹+文件全名）

String docPath=sourceFilesPath+"\\"+fileName;

//转换后的文档的全路径（所在文件夹+文件名）

String othersPath=destinationFilesPath+"\\"+fileName.substring(0,(fileName.length()-5));

String inFile=docPath;

String outFile=othersPath;

boolean flag=false;

//核心代码

try{

//设置word可见性

app.setProperty("Visible", new Variant(false));

Dispatch docs=app.getProperty("Documents").toDispatch();

//打开word文档

Dispatch doc=Dispatch.invoke(docs, "Open", Dispatch.Method, new Object[]{inFile,new Variant(false),new Variant(true)}, new int[1]).toDispatch();

//0:Microsoft Word 97 - 2003 文档 (.doc)

//1:Microsoft Word 97 - 2003 模板 (.dot)

//2:文本文档 (.txt)

//3:文本文档 (.txt)

//4:文本文档 (.txt)

//5:文本文档 (.txt)

//6:RTF 格式 (.rtf)

//7:文本文档 (.txt)

//8:HTML 文档 (.htm)(带文件夹)

//9:MHTML 文档 (.mht)(单文件)

//10:MHTML 文档 (.mht)(单文件)

//11:XML 文档 (.xml)

//12:Microsoft Word 文档 (.docx)

//13:Microsoft Word 启用宏的文档 (.docm)

//14:Microsoft Word 模板 (.dotx)

//15:Microsoft Word 启用宏的模板 (.dotm)

//16:Microsoft Word 文档 (.docx)

//17:PDF 文件 (.pdf)

//18:XPS 文档 (.xps)

//19:XML 文档 (.xml)

//20:XML 文档 (.xml)

//21:XML 文档 (.xml)

//22:XML 文档 (.xml)

//23:OpenDocument 文本 (.odt)

//24:WTF 文件 (.wtf)

//另存为指定格式的文档

Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]{outFile,new Variant(destinationFilesType)}, new int[1]);

Variant file=new Variant(false);

//关闭文档

Dispatch.call(doc, "Close",file);

flag=true;

}catch(Exception e){

e.printStackTrace();

System.out.println("文档转换失败");

}finally{

app.invoke("Quit",new Variant[]{});

}

System.out.println("转换完毕");

}

//word97-2003扩展名为doc

//判断是否为word2003-2007文档，及是否以doc为后缀名

else if(fileType.equals(".doc")){

System.out.println("正在转换。。。");

//输出word文档所在路劲

System.out.println("目录："+sourceFilesPath);

//输出word文档名

System.out.println("文件名："+fileName);

//System.out.println(fileName.substring(0, (fileName.length()-4)));

//核心函数

//启动word

ActiveXComponent app=new ActiveXComponent("Word.Application");

//要转换的文档的全路径（所在文件夹+文件全名）

String docPath=sourceFilesPath+"\\"+fileName;

//转换后的文档的全路径（所在文件夹+文件名）

String othersPath=destinationFilesPath+"\\"+fileName.substring(0,(fileName.length()-4));

String inFile=docPath;

String outFile=othersPath;

boolean flag=false;

//核心代码

try{

//设置word可见性

app.setProperty("Visible", new Variant(false));

Dispatch docs=app.getProperty("Documents").toDispatch();

//打开word文档

Dispatch doc=Dispatch.invoke(docs, "Open", Dispatch.Method, new Object[]{inFile,new Variant(false),new Variant(true)}, new int[1]).toDispatch();

//另存为指定格式的文档

Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]{outFile,new Variant(destinationFilesType)}, new int[1]);

Variant file=new Variant(false);

//关闭文档

Dispatch.call(doc, "Close",file);

flag=true;

}catch(Exception e){

e.printStackTrace();

System.out.println("文档转换失败");

}finally{

app.invoke("Quit",new Variant[]{});

}

System.out.println("转换完毕");

}

//文档的扩展名不是doc或docx

else{

System.out.println("非word文档");

}

//如果是子文件夹，则递归遍历，将所有的word文档转换

else{

sourceChildPath=sourceFilesPath;

//该文件是目录

sourceChildPath=sourceChildPath+"\\"+sourceFilesList[i].getName()+"\\";

System.out.println("源文件所在路径："+sourceChildPath);

//修改目标文件夹，保持原来的层级关系

destiNationChildPath=destinationFilesPath;

destiNationChildPath=destinationFilesPath+"\\"+sourceFilesList[i].getName()+"\\";

System.out.println("转换后文件所在路径"+destiNationChildPath);

mkdir(destiNationChildPath);

//递归遍历所有目录，查找word文档，并将其转换

change(sourceChildPath, destiNationChildPath,destinationFilesType);

}

System.out.println("所有文档转换完毕");

}

/**

　* 用于创建文件夹的方法

　* @param mkdirName

public static void mkdir(String mkdirName){

try{

//使用指定的路径创建文件对象

File dirFile = new File(mkdirName);

boolean bFile = dirFile.exists();

//已经存在文件夹，操作？？？提醒是否要替换

if( bFile == true ) {

System.out.println("已经存在文件夹"+mkdirName);

}

//不存在该文件夹，则新建该目录

else{

System.out.println("新建文件夹"+mkdirName);

bFile = dirFile.mkdir();

if( bFile == true ){

System.out.println("文件夹创建成功");

}else{

System.out.println(" 文件夹创建失败，清确认磁盘没有写保护并且空件足够");

System.exit(1);

}

}catch(Exception err){

System.err.println("ELS - Chart : 文件夹创建发生异常");

err.printStackTrace();

}finally{

}

/**

* 判断某个文件夹是否存在

* @param path

public static boolean isPathExist(String path){

boolean isPathExist=false;

try{

File pathFile = new File(path);

if(pathFile.exists())

isPathExist= true;

else

isPathExist= false;

}catch(Exception err){

err.printStackTrace();

}

return isPathExist;

}

/**

* 主函数

public static void main(String[] args){

Scanner sc=new Scanner(System.in);

//源文档所在路径

String sourceFilesPath="";

// String inputSourcePath="";

// boolean sourcePathFlag=true;

// System.out.println("请输入要转换文档所在的文件夹");

// while(sourcePathFlag){

// inputSourcePath=sc.next();

// if(!isPathExist(inputSourcePath))

// System.out.println("源路径不存在，请输入正确的路径");

// else

// sourcePathFlag=false;

// }

// sourceFilesPath=inputSourcePath;

sourceFilesPath="D:\\word";

//目标文档要存放的目录

String destinationFilesPath="";

// String inputdestinationPath="";

// boolean destinationPathFlag=true;

// System.out.println("请输入转换后文档要存放的文件夹");

// while(destinationPathFlag){

// inputdestinationPath=sc.next();

// //目标文件不存在时，是否要提示用户创建文件

// if(!isPathExist(inputdestinationPath))

// System.out.println("目标路径不存在，请输入正确的路径");

// else

// destinationPathFlag=false;

// }

// destinationFilesPath=inputdestinationPath;

destinationFilesPath="D:\\txt";

//选择要转换的类型

int destinationFilesType=0;

int inputNumber=0;

boolean numFlag=true;

System.out.println("您要将word文档转换为哪种文档格式？");

System.out.println("0:doc \t 2:txt \t 8:html \t 9:htm \t 11:xml \t 12:docx \t 17:pdf \t 18:xps");

while(numFlag){

inputNumber=sc.nextInt();

if(inputNumber!=2inputNumber!=8inputNumber!=9inputNumber!=11inputNumber!=12inputNumber!=17){

System.out.println("您的输入有误，请输入要转换的文档类型前的数字");

}else

numFlag=false;

}

destinationFilesType=inputNumber;

//实行转换

change(sourceFilesPath, destinationFilesPath,destinationFilesType);

//测试各种类型转换

// for(int i=0;i25;i++){

// destinationFilesType=i;

// System.out.println("文件类型"+destinationFilesType);

// System.out.println("存放目录："+destinationFilesPath+"\\"+i);

// mkdir(destinationFilesPath+"\\"+i);

// change(sourceFilesPath, destinationFilesPath+"\\"+i,destinationFilesType);

// }

}

这段代码我刚用过，Word 的格式都能保留下来。你还需要自行下载 jacob 的 jar 包和对应的 dll 文件。

JAVA有什么好的方法可以将word里的文本读取出来

你用免费版的Free Spire.Doc for Java可以直接读取Word文档里面的文本，参考代码：

import com.spire.doc.Document;

import java.io.FileWriter;

import java.io.IOException;

public class ExtractText {

public static void main(String[] args) throws IOException {

//加载Word文档

Document document = new Document();

document.loadFromFile("C:\\Users\\Administrator\\Desktop\\sample.docx");

//获取文档中的文本保存为String

String text=document.getText();

//将String写入Txt文件

writeStringToTxt(text,"ExtractedText.txt");

}

public static void writeStringToTxt(String content, String txtFileName) throws IOException {

FileWriter fWriter= new FileWriter(txtFileName,true);

try {

fWriter.write(content);

}catch(IOException ex){

ex.printStackTrace();

}finally{

try{

fWriter.flush();

fWriter.close();

} catch (IOException ex) {

ex.printStackTrace();

}

参考自官网原文。

java读取doc,pdf问题。

PDFBox 是一个用于操作 PDF 文件的开源库。使用时需要把 PDFBox-0.7.3.jar 和 FontBox1.0.jar 都加入 classpath，缺少 FontBox 会报错。

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfReader {

/**

* simply reader all the text from a pdf file.

* You have to deal with the format of the output text by yourself.

* 2008-2-25

* @param pdfFilePath file path

* @return all text in the pdf file

public static String getTextFromPDF(String pdfFilePath)

{

String result = null;

FileInputStream is = null;

PDDocument document = null;

try {

is = new FileInputStream(pdfFilePath);

PDFParser parser = new PDFParser(is);

parser.parse();

document = parser.getPDDocument();

PDFTextStripper stripper = new PDFTextStripper();

result = stripper.getText(document);

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} finally {

if (is != null) {

try {

is.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

if (document != null) {

try {

document.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

return result;

}

public static void main(String[] args)

{

String str=PdfReader.getTextFromPDF("C:\\Read.pdf");

System.out.println(str);

}

代码2：

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.net.MalformedURLException;

import java.net.URL;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PDFReader {

public void readFdf(String file) throws Exception {

boolean sort = false;

String pdfFile = file;

String textFile = null;

String encoding = "UTF-8";

int startPage = 1;

int endPage = Integer.MAX_VALUE;

Writer output = null;

PDDocument document = null;

try {

// 首先当作一个URL来装载文件，如果得到异常再从本地文件系统//去装载文件

URL url = new URL(pdfFile);

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

// 获取PDF的文件名

String fileName = url.getFile();

// 以原来PDF的名称来命名新产生的txt文件

if (fileName.length() 4) {

File outputFile = new File(fileName.substring(0, fileName

.length() - 4)

+ ".txt");

textFile = outputFile.getName();

}

} catch (MalformedURLException e) {

// 如果作为URL装载得到异常则从文件系统装载

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

if (pdfFile.length() 4) {

textFile = pdfFile.substring(0, pdfFile.length() - 4)

+ ".txt";

}

output = new OutputStreamWriter(new FileOutputStream(textFile),

encoding);

PDFTextStripper stripper = null;

stripper = new PDFTextStripper();

// 设置是否排序

stripper.setSortByPosition(sort);

// 设置起始页

stripper.setStartPage(startPage);

// 设置结束页

stripper.setEndPage(endPage);

// 调用PDFTextStripper的writeText提取并输出文本

stripper.writeText(document, output);

} finally {

if (output != null) {

// 关闭输出流

output.close();

}

if (document != null) {

// 关闭PDF Document

document.close();

}

/**

* @param args

public static void main(String[] args) {

// TODO Auto-generated method stub

PDFReader pdfReader = new PDFReader();

try {

// 取得E盘下的SpringGuide.pdf的内容

pdfReader.readFdf("C:\\Read.pdf");

} catch (Exception e) {

e.printStackTrace();

}

2、抽取支持中文的pdf文件－xpdf

xpdf是一个开源项目，我们可以调用他的本地方法来实现抽取中文pdf文件。

补丁包：

按照readme放好中文的patch，就可以开始写调用本地方法的java程序了。

下面是一个如何调用的例子：

import java.io.*;

/**
 * <p>Title: pdf extraction</p>
 * <p>Description: email:chris@matrix.org.cn</p>
 * <p>Company: Matrix.org.cn</p>
 *
 * <p>Runs xpdf's {@code pdftotext} as an external process and prints the text it emits.
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfWin {

    public PdfWin() {
    }

    /**
     * Invokes {@code pdftotext} on a fixed PDF and prints the extracted text.
     *
     * @param args unused
     * @throws Exception if the process cannot be started or its output cannot be read
     */
    public static void main(String[] args) throws Exception {
        // The backslashes of both paths were lost in the published snippet; restored here.
        String pathToXpdf = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // The trailing "-" is passed where pdftotext expects the output file name.
        String[] cmd = new String[] {pathToXpdf, "-enc", "UTF-8", "-q", filename, "-"};
        Process p = Runtime.getRuntime().exec(cmd);

        BufferedInputStream bis = new BufferedInputStream(p.getInputStream());
        InputStreamReader reader = new InputStreamReader(bis, "UTF-8");
        StringWriter out = new StringWriter();
        char[] buf = new char[10000];
        int len;
        try {
            // read() returns -1 at end of stream.
            while ((len = reader.read(buf)) >= 0) {
                // Accumulate every chunk; printing the buffer alone would show only the
                // last chunk plus stale characters from earlier reads.
                out.write(buf, 0, len);
                System.out.println("the length is" + len);
            }
        } finally {
            reader.close();
        }
        // Wait for the process to exit after its output has been drained.
        p.waitFor();

        String ts = out.toString();
        System.out.println("the str is" + ts);
    }
}

如何使用JAVA，POI读写word文档

public class HwpfTest {

@SuppressWarnings("deprecation")

@Test

public void testReadByExtractor() throws Exception {

InputStream is = new FileInputStream("D:\\test.doc");

WordExtractor extractor = new WordExtractor(is);

//输出word文档所有的文本

System.out.println(extractor.getText());

System.out.println(extractor.getTextFromPieces());

//输出页眉的内容

System.out.println("页眉：" + extractor.getHeaderText());

//输出页脚的内容

System.out.println("页脚：" + extractor.getFooterText());

//输出当前word文档的元数据信息，包括作者、文档的修改时间等。

System.out.println(extractor.getMetadataTextExtractor().getText());

//获取各个段落的文本

String paraTexts[] = extractor.getParagraphText();

for (int i=0; iparaTexts.length; i++) {

System.out.println("Paragraph " + (i+1) + " : " + paraTexts[i]);

}

//输出当前word的一些信息

printInfo(extractor.getSummaryInformation());

//输出当前word的一些信息

this.printInfo(extractor.getDocSummaryInformation());

this.closeStream(is);

}

/**

* 输出SummaryInfomation

* @param info

private void printInfo(SummaryInformation info) {

//作者

System.out.println(info.getAuthor());

//字符统计

System.out.println(info.getCharCount());

//页数

System.out.println(info.getPageCount());

//标题

System.out.println(info.getTitle());

//主题

System.out.println(info.getSubject());

}

/**

* 输出DocumentSummaryInfomation

* @param info

private void printInfo(DocumentSummaryInformation info) {

//分类

System.out.println(info.getCategory());

//公司

System.out.println(info.getCompany());

}

/**

* 关闭输入流

* @param is

private void closeStream(InputStream is) {

if (is != null) {

try {

is.close();

} catch (IOException e) {

e.printStackTrace();

}

java如何获得doc文件内容

Java 生成 doc 文件的常见做法是：模板 + 数据 = doc 文件。

反过来，doc 文件 + 模板才能还原出数据。

如果你没有对应的模板，基本上是做不到的。

Windows 软件

Linux 软件

Mac 软件

安卓软件

各类文章

java读取doc文本,java读取doc文件内容

本文目录一览：

java读取word文件的内容

java读取带格式word内容

JAVA有什么好的方法可以将word里的文本读取出来

java读取doc,pdf问题。

如何使用JAVA，POI读写word文档

java如何获得doc文件内容

java读取doc文本,java读取doc文件内容

java读取word,java读取word内容包括格式

java遍历读取xml文件内容（java读写xml文件）

Python读取Doc文件的多方面分析

PHPWord读取doc详解

我用java读取rtf文档（java读取tif文件）

java读取文件,java读取文件中文乱码

java读取文本,java读取文本数据然后排序

java读取xml,JAVA读取json文件

java读取,java读取excel文件

java打印文件,java读取文件并打印

Java文件读取

JS读取txt文件

javaxml读取,java生成xml文件

使用Java读取文件的方法

java读取文件内容,java读取文件内容并替换

java读二进制文件中结构体,java读取二进制文件内容

Java读取文件内容

Java读取文件内容

java记录代码文档介绍内容（java开发文档示例）

Windows 软件

Linux 软件

Mac 软件

安卓软件

各类文章

java读取doc文本,java读取doc文件内容

本文目录一览：

java读取word文件的内容

java读取带格式word内容

JAVA有什么好的方法可以将word里的文本读取出来

java读取doc,pdf问题。

如何使用JAVA，POI读写word文档

java如何获得doc文件内容

java读取doc文本,java读取doc文件内容

java读取word,java读取word内容包括格式

java遍历读取xml文件内容（java读写xml文件）

Python读取Doc文件的多方面分析

PHPWord读取doc详解

我用java读取rtf文档（java读取tif文件）

java读取文件,java读取文件中文乱码

java读取文本,java读取文本数据然后排序

java读取xml,JAVA读取json文件

java读取,java读取excel文件

java打印文件,java读取文件并打印

Java文件读取

JS读取txt文件

javaxml读取,java生成xml文件

使用Java读取文件的方法

java读取文件内容,java读取文件内容并替换

java读二进制文件中结构体,java读取二进制文件内容

Java读取文件内容

Java读取文件内容

java记录代码文档介绍内容（java开发文档示例）

人机检测，请谅解