-
C#教程之C# Net 使用openxml提取word中的文本和图片并转为Html
本站最新发布 C#从入门到精通
试听地址 https://www.xin3721.com/eschool/CSharpxin3721/
试听地址 https://www.xin3721.com/eschool/CSharpxin3721/
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
注:图片只支持内嵌(inline)方式,外嵌图片未做处理;不支持公式
------------------------------------------------
---------------文章最后为效果------------
------------------------------------------------
加入 NuGet 包:DocumentFormat.OpenXml
创建文件:Read.cs
复制下面全部代码到文件 Read.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Xml;
using System.Xml.Xsl;

namespace YCBX.Office.WordXml
{
    /// <summary>
    /// Helpers that extract the text and inline pictures of a Word (.docx) document
    /// as HTML fragments, and that concatenate several Word documents into one.
    /// </summary>
    public class WordRead
    {
        /// <summary>
        /// Opens the Word file at <paramref name="wordPathStr"/> and converts it to HTML fragments.
        /// </summary>
        /// <param name="wordPathStr">Path of the .docx file to read.</param>
        /// <returns>One HTML fragment per paragraph / soft line break; see <see cref="ReadToHtml(Stream)"/>.</returns>
        public static List<string> ReadToHtml(string wordPathStr)
        {
            // The document is only read, so open it read-only and release the handle
            // as soon as the conversion finishes.
            using (FileStream wordStream = new FileStream(wordPathStr, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return ReadToHtml(wordStream);
            }
        }

        /// <summary>
        /// Converts the Word document contained in <paramref name="wordStream"/> to HTML fragments.
        /// </summary>
        /// <remarks>
        /// Only runs that are direct children of a body paragraph are processed. Text is HTML-encoded,
        /// inline pictures become <c>&lt;img&gt;</c> tags with a base64 data URI, and every
        /// <see cref="Break"/> element ends the current fragment and starts a new one.
        /// Drawings that are not inline pictures, and formulas, are skipped.
        /// </remarks>
        /// <param name="wordStream">Readable stream positioned on a .docx package. It is not closed by this method.</param>
        /// <returns>The HTML fragments in document order; empty when the document has no body.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordStream"/> is <c>null</c>.</exception>
        public static List<string> ReadToHtml(Stream wordStream)
        {
            if (wordStream == null)
            {
                throw new ArgumentNullException(nameof(wordStream));
            }

            using (WordprocessingDocument doc = WordprocessingDocument.Open(wordStream, false))
            {
                // Settings intended for the (disabled) formula conversion below:
                // XmlWriterSettings settings = new XmlWriterSettings() { OmitXmlDeclaration = true, ConformanceLevel = ConformanceLevel.Auto, DoNotEscapeUriAttributes = true };
                List<string> paragraphHtmls = new List<string>();
                MainDocumentPart mainPart = doc.MainDocumentPart;
                Body body = mainPart?.Document?.Body;
                if (body == null)
                {
                    return paragraphHtmls;
                }

                // Paragraphs
                foreach (Paragraph paragraph in body.Elements<Paragraph>())
                {
                    StringBuilder paragraphHtml = new StringBuilder();

                    // Runs
                    foreach (OpenXmlElement child in paragraph.ChildElements)
                    {
                        if (child is Run)
                        {
                            foreach (OpenXmlElement element in child.Elements())
                            {
                                if (element is Break)
                                {
                                    // Soft line break: emit what was collected so far as its own fragment.
                                    paragraphHtmls.Add(paragraphHtml.ToString());
                                    paragraphHtml = new StringBuilder();
                                }
                                else if (element is Text text)
                                {
                                    // Encode so that '<', '>' and '&' in the document cannot break the HTML.
                                    paragraphHtml.Append(WebUtility.HtmlEncode(text.Text));
                                }
                                else if (element is Drawing drawing)
                                {
                                    string imgHtml = BuildImageHtml(mainPart, drawing);
                                    if (imgHtml != null)
                                    {
                                        paragraphHtml.Append(imgHtml);
                                    }
                                }
                            }
                        }
                        // Formula conversion is not enabled:
                        //else if (child is DocumentFormat.OpenXml.Math.OfficeMath math)
                        //{
                        //    var x = new XmlDocument();
                        //    x.LoadXml(math.OuterXml);
                        //    using var ms = ConvertToMatchMl(x, settings);
                        //    paragraphHtml.Append(ConvertToLatex(settings, ms));
                        //}
                    }

                    paragraphHtmls.Add(paragraphHtml.ToString());
                }

                return paragraphHtmls;
            }
        }

        /// <summary>
        /// Builds an <c>&lt;img&gt;</c> tag with an embedded base64 data URI for an inline picture.
        /// </summary>
        /// <param name="mainPart">Main document part that owns the image relationship.</param>
        /// <param name="drawing">Drawing element found inside a run.</param>
        /// <returns>The tag, or <c>null</c> when the drawing is not an inline embedded picture.</returns>
        private static string BuildImageHtml(MainDocumentPart mainPart, Drawing drawing)
        {
            // Only inline drawings are handled; anchored (floating) drawings have no Inline element.
            var inline = drawing.Inline;
            var extent = inline?.Extent;
            var pic = inline?.Graphic?.GraphicData?.GetFirstChild<DocumentFormat.OpenXml.Drawing.Pictures.Picture>();
            string embed = pic?.BlipFill?.Blip?.Embed?.Value;
            if (extent == null || extent.Cx is null || extent.Cy is null || string.IsNullOrEmpty(embed))
            {
                return null;
            }

            // Resolve the relationship id to the image part and read ALL of its bytes;
            // a single Stream.Read call is allowed to return fewer bytes than requested.
            OpenXmlPart part = mainPart.GetPartById(embed);
            byte[] bytes;
            using (Stream stream = part.GetStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                stream.CopyTo(buffer);
                bytes = buffer.ToArray();
            }

            // ImageExtent.EMU_TO_PX is declared outside this file; it is called exactly as before.
            string width = ImageExtent.EMU_TO_PX((decimal)extent.Cx.Value).ToString("0.");
            string height = ImageExtent.EMU_TO_PX((decimal)extent.Cy.Value).ToString("0.");

            return $"<img width='{width}' height='{height}' src='data:{part.ContentType};base64,{Convert.ToBase64String(bytes)}' />";
        }

        /// <summary>
        /// Merges several Word documents: copies the first file to <paramref name="finalFile"/>
        /// (overwriting it) and appends deep clones of the body elements of every other file.
        /// </summary>
        /// <remarks>
        /// Does nothing when fewer than two files are given. Only body XML is cloned;
        /// parts referenced by the appended content (for example images) are not copied.
        /// </remarks>
        /// <param name="finalFile">Path of the merged document to create.</param>
        /// <param name="files">Paths of the documents to merge, in order.</param>
        /// <exception cref="ArgumentNullException"><paramref name="files"/> is <c>null</c>.</exception>
        public static void Combine(string finalFile, List<string> files)
        {
            if (files == null)
            {
                throw new ArgumentNullException(nameof(files));
            }

            if (files.Count < 2)
            {
                return;
            }

            File.Copy(files[0], finalFile, true);

            using (WordprocessingDocument target = WordprocessingDocument.Open(finalFile, true))
            {
                Body targetBody = target.MainDocumentPart.Document.Body;
                for (int i = 1; i < files.Count; i++)
                {
                    // Source documents are only read, so they are opened non-editable.
                    using (WordprocessingDocument source = WordprocessingDocument.Open(files[i], false))
                    {
                        foreach (OpenXmlElement element in source.MainDocumentPart.Document.Body.Elements())
                        {
                            targetBody.Append(element.CloneNode(true));
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Applies the <c>xsltml/mmltex.xsl</c> stylesheet (looked up under the application base
        /// directory) to the XML in <paramref name="ms"/> and returns the transformed text.
        /// </summary>
        /// <param name="settings">Writer settings used for the transform output.</param>
        /// <param name="ms">Stream containing the XML input; it is left open for the caller to dispose.</param>
        /// <returns>The transform output decoded as UTF-8 text.</returns>
        private static string ConvertToLatex(XmlWriterSettings settings, Stream ms)
        {
            var latexTransform = new XslCompiledTransform();
            latexTransform.Load(
                Path.Combine(AppContext.BaseDirectory, "xsltml", "mmltex.xsl"),
                new XsltSettings(true, true),
                new XmlUrlResolver());

            using var output = new MemoryStream();
            using (XmlWriter writer = XmlWriter.Create(output, settings))
            {
                // The reader is intentionally not disposed: closing it would also close the caller's stream.
                latexTransform.Transform(new XmlTextReader(ms), writer);

                // Make sure buffered output has reached the stream before it is rewound and read.
                writer.Flush();
                output.Seek(0, SeekOrigin.Begin);

                using var reader = new StreamReader(output, Encoding.UTF8, true, 1024, leaveOpen: true);
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Applies the <c>xsltml/OMML2MML.XSL</c> stylesheet (looked up under the application base
        /// directory) to <paramref name="xmlDocument"/>.
        /// </summary>
        /// <param name="xmlDocument">XML document to transform.</param>
        /// <param name="settings">Writer settings used for the transform output.</param>
        /// <returns>A stream rewound to the start of the transform output; the caller owns and disposes it.</returns>
        private static Stream ConvertToMatchMl(XmlDocument xmlDocument, XmlWriterSettings settings)
        {
            var ms = new MemoryStream();
            var xslTransform = new XslCompiledTransform();
            xslTransform.Load(Path.Combine(AppContext.BaseDirectory, "xsltml", "OMML2MML.XSL"));

            // The writer is flushed rather than disposed so the returned stream can never be
            // closed on the caller (disposing would close it if settings.CloseOutput were set).
            XmlWriter writer = XmlWriter.Create(ms, settings);
            xslTransform.Transform(xmlDocument, writer);
            writer.Flush();

            ms.Seek(0, SeekOrigin.Begin);
            return ms;
        }
    }
}
栏目列表
最新更新
C# 面向对象
假设客车的座位数是9行4列,使用二维数
C#基于接口设计三层架构Unity篇
C#线程 入门
C#读取静态类常量属性和值
C# 插件式编程
C# 委托与事件有啥区别?
C#队列学习笔记:队列(Queue)和堆栈(Stack
linq 多表分组左连接查询查询统计
C#队列学习笔记:MSMQ入门一
C# 在Word中添加Latex 数学公式和符号
inncheck命令 – 检查语法
基于UDP的服务器端和客户端
再谈UDP和TCP
在socket编程中使用域名
网络数据传输时的大小端问题
socket编程实现文件传输功能
如何优雅地断开TCP连接?
图解TCP四次握手断开连接
详细分析TCP数据的传输过程
SqlServer 利用游标批量更新数据
BOS只读状态修改
SQL Server等待事件—PAGEIOLATCH_EX
数据库多行转换为单一列
获取数据表最后最后访问,修改,更新,
计算经历的时间
SQL查询结果自定义排序
修改数据库默认位置
日期简单加或减
从日期获取年,月或日