使用Python处理Word文件

安装外部模块python-docx

pip install python-docx

1. 从Python看Word文件结构

在python-docx模块中，将Word文件结构分成3层：

Document：最高层，代表整个Word文件。
Paragraph：一个Word文件由许多段落组成。在Python中，整份文件对应Document对象，其中每个段落对应一个Paragraph对象，所有段落以Paragraph对象列表的方式存在（document.paragraphs）。
Run：Word文件要考虑的有字号、字体样式、色彩等，统称为样式。一个Run对象指的是Paragraph对象中相同样式的连续文字，如果文字发生样式变化，Python将以新的Run对象代表。

2. 读取Word文件内容

# author:mlnt
# createdate:2022/8/15
import docx  # 导入docx模块

# 1. Open the existing Word file as a Document object.
document = docx.Document('test.docx')

# 2. Report the paragraph count, then each paragraph's text and its runs.
# The paragraph list is fetched once and reused for both the count and the loop.
paragraphs = document.paragraphs
paragraph_count = len(paragraphs)
print(f'段落数：{paragraph_count}')
for paragraph in paragraphs:
    print(paragraph.text)  # full text of this paragraph
    # Runs are numbered per paragraph, so walk this paragraph's own runs
    # instead of indexing them with the paragraph number (which raises
    # IndexError whenever a paragraph has too few runs, e.g. an empty one).
    for run in paragraph.runs:
        print(run.text)  # text of one run of this paragraph

def getFile(filename):
    """Print every paragraph of a Word file and return the text as one string.

    The file at ``filename`` is opened with ``docx.Document``. The text of
    each paragraph is printed, one paragraph per ``print`` call, and the
    paragraph texts are returned joined by a blank line (two newlines).
    """
    paragraph_texts = [para.text for para in docx.Document(filename).paragraphs]
    for text in paragraph_texts:
        print(text)
    return '\n\n'.join(paragraph_texts)


# Read test.docx through getFile() and print the combined text it returns
file_text = getFile('test.docx')
print(file_text)
# Save the Document object under a new file name, i.e. write a copy of the file
document.save('out_test.docx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

test.docx：
在这里插入图片描述

out_test.docx
在这里插入图片描述

3. 创建文件内容

创建docx对象

# 1. Create a Document object; no file name is passed, so this starts a new document
document = docx.Document()
1
2

设置页眉

# Page header: put the text "test" into the first paragraph of the first
# section's header and centre that paragraph
header_paragraph = document.sections[0].header.paragraphs[0]
run_header = header_paragraph.add_run("test")
header_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1
2
3

添加标题

# 2. Add headings.
# add_heading() creates a heading:
#   document.add_heading('content_of_heading', level=n)
# The poem title uses heading level 1 and the author's name heading level 2.
for heading_text, heading_level in (('侠客行', 1), ('李白', 2)):
    document.add_heading(heading_text, level=heading_level)
1
2
3
4
5
6
7

添加段落

# 3. Add paragraphs.
# add_paragraph() creates a Paragraph with the given content:
#   document.add_paragraph('paragraph_content')
verses = (
    '赵客缦胡缨，吴钩霜雪明。',
    '银鞍照白马，飒沓如流星。',
    '十步杀一人，千里不留行。',
    '事了拂衣去，深藏身与名。',
    '闲过信陵饮，脱剑膝前横。',
    '将炙啖朱亥，持觞劝侯嬴。',
    '三杯吐然诺，五岳倒为轻。',
    '眼花耳热后，意气素霓生。',
    '救赵挥金槌，邯郸先震惊。',
    '千秋二壮士，烜赫大梁城。',
    '纵死侠骨香，不惭世上英。',
    '谁能书阁下，白首太玄经。',
)
# Keep a handle on the first verse so a paragraph can be inserted in front of it
paragraph_object = document.add_paragraph(verses[0])
for verse in verses[1:]:
    document.add_paragraph(verse)
# Insert a new, empty paragraph before the first verse
prior_paragraph_object = paragraph_object.insert_paragraph_before('')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

建立Run内容，设置样式

# 4. Build Run content.
# A Paragraph is made of Runs; add_run() inserts content into a Paragraph:
#   paragraph_object.add_run('run_content')
run1 = prior_paragraph_object.add_run('*' * 13)
run2 = prior_paragraph_object.add_run('%' * 13)
# Style the Runs. Style switches listed by the tutorial:
#   bold: bold, italic: italic, underline: underline, strike: strikethrough
run1.bold = True
run2.underline = True

# Centre every paragraph currently in the document. The paragraph list is
# iterated directly instead of being re-read and indexed on each pass.
for paragraph in document.paragraphs:
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

添加换页符

# 5. Add a page break.
# add_page_break() is called on the Document, with no arguments.
document.add_page_break()
1
2
3

插入图片

# 6. Insert a picture.
# add_picture() inserts the image; the width/height values (Pt) come from the
# docx.shared module, which must be imported to size the picture.
document.add_picture('libai.jpeg', width=Pt(200), height=Pt(300))

# Centre the last paragraph of the document
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1
2
3
4
5
6

创建表格，添加数据并设置简单样式

# 7. Create a table.
# add_table(rows=n, cols=m) creates a table with n rows and m columns.
table = document.add_table(rows=2, cols=5)

# Fill the two initial rows: the header row first, then Li Bai's record.
# Each row's cells are fetched once and paired with their values.
initial_rows = (
    ('姓名', '字', '号', '所处时代', '别称'),
    ('李白', '太白', '青莲居士', '唐朝', '诗仙'),
)
for row, values in zip(table.rows, initial_rows):
    for cell, value in zip(row.cells, values):
        cell.text = value

# Append a row and fill it.
new_row = table.add_row()
for cell, value in zip(new_row.cells, ('白居易', '乐天', '香山居士', '唐朝', '诗魔')):
    cell.text = value

# Append a one-inch-wide column and fill it from the top row down.
new_column = table.add_column(width=Inches(1))
for cell, value in zip(
    new_column.cells,
    ('代表作', '《侠客行》、《静夜思》', '《长恨歌》、《琵琶行》'),
):
    cell.text = value

# Table dimensions after the additions.
rows = len(table.rows)
cols = len(table.columns)
print(f'rows: {rows}')
print(f'columns: {cols}')

# To print the table contents:
# for row in table.rows:
#     for cell in row.cells:
#         print(cell.text)

# Table style, selected by style *name*. Selecting it by style id
# ('LightShading-Accent1') produces:
# "UserWarning: style lookup by style_id is deprecated. Use style name as key instead."
table.style = 'Light Shading Accent 1'

# Centre every cell vertically and horizontally.
for r in range(rows):
    for c in range(cols):
        cell = table.cell(r, c)
        cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER  # vertical centring
        # Horizontal centring is set on the cell's first paragraph, so it takes
        # the paragraph-alignment enum rather than WD_TABLE_ALIGNMENT.
        cell.paragraphs[0].paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

设置页码并保存

# Page number: pass the first paragraph of the first section's footer to add_page_number()
footer_paragraph = document.sections[0].footer.paragraphs[0]
add_page_number(footer_paragraph)
# Save the document
document.save('test2.docx')
1
2
3
4

设置页码的代码(page_num.py)

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement, ns


def create_element(name):
    """Return a new XML element for the prefixed tag ``name``.

    Thin wrapper around ``OxmlElement``; ``name`` is passed through unchanged
    (callers in this file use tags such as ``'w:t'`` and ``'w:fldChar'``).
    """
    element = OxmlElement(name)
    return element


def create_attribute(element, name, value):
    """Set the attribute ``name`` of ``element`` to ``value``.

    ``name`` is a prefixed attribute name (e.g. ``'w:fldCharType'`` or
    ``'xml:space'``); it is converted with ``ns.qn`` before being handed to
    ``element.set``.
    """
    qualified_name = ns.qn(name)
    element.set(qualified_name, value)


def _append_text_run(paragraph, text):
    """Append a run to ``paragraph`` holding a ``w:t`` element with ``text``."""
    run = paragraph.add_run()
    text_element = create_element('w:t')
    # xml:space="preserve" so the leading/trailing blanks of the text are kept
    create_attribute(text_element, 'xml:space', 'preserve')
    text_element.text = text
    run._r.append(text_element)


def _append_field_run(paragraph, instruction):
    """Append a run to ``paragraph`` holding a field with ``instruction``.

    The run receives three children in order: a ``w:fldChar`` of type
    ``begin``, a ``w:instrText`` carrying ``instruction``, and a ``w:fldChar``
    of type ``end``.
    """
    run = paragraph.add_run()

    field_begin = create_element('w:fldChar')
    create_attribute(field_begin, 'w:fldCharType', 'begin')

    instruction_text = create_element('w:instrText')
    create_attribute(instruction_text, 'xml:space', 'preserve')
    instruction_text.text = instruction

    field_end = create_element('w:fldChar')
    create_attribute(field_end, 'w:fldCharType', 'end')

    for element in (field_begin, instruction_text, field_end):
        run._r.append(element)


def add_page_number(paragraph):
    """Centre ``paragraph`` and append "Page <PAGE> of <NUMPAGES>" to it.

    Four runs are appended in order: the literal text ``'Page '``, a field
    with the instruction ``PAGE``, the literal text ``' of '``, and a field
    with the instruction ``NUMPAGES``.

    Args:
        paragraph: the paragraph to modify in place (the tutorial passes the
            first paragraph of a section footer).

    Returns:
        None.
    """
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    _append_text_run(paragraph, 'Page ')
    _append_field_run(paragraph, 'PAGE')
    _append_text_run(paragraph, ' of ')
    _append_field_run(paragraph, 'NUMPAGES')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

完整代码

import docx
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_CELL_VERTICAL_ALIGNMENT
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, Inches
from page_num import add_page_number

# 1. Create a Document object
document = docx.Document()

# Page header: put the text "test" into the first paragraph of the first
# section's header and centre that paragraph
header_paragraph = document.sections[0].header.paragraphs[0]
run_header = header_paragraph.add_run("test")
header_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
# Print how many sections the document has
section_count = len(document.sections)
print(section_count)

# 2. Add headings.
# add_heading() creates a heading:
#   document.add_heading('content_of_heading', level=n)
# The poem title uses heading level 1 and the author's name heading level 2.
for heading_text, heading_level in (('侠客行', 1), ('李白', 2)):
    document.add_heading(heading_text, level=heading_level)

# 3. Add paragraphs.
# add_paragraph() creates a Paragraph with the given content:
#   document.add_paragraph('paragraph_content')
verses = (
    '赵客缦胡缨，吴钩霜雪明。',
    '银鞍照白马，飒沓如流星。',
    '十步杀一人，千里不留行。',
    '事了拂衣去，深藏身与名。',
    '闲过信陵饮，脱剑膝前横。',
    '将炙啖朱亥，持觞劝侯嬴。',
    '三杯吐然诺，五岳倒为轻。',
    '眼花耳热后，意气素霓生。',
    '救赵挥金槌，邯郸先震惊。',
    '千秋二壮士，烜赫大梁城。',
    '纵死侠骨香，不惭世上英。',
    '谁能书阁下，白首太玄经。',
)
# Keep a handle on the first verse so a paragraph can be inserted in front of it
paragraph_object = document.add_paragraph(verses[0])
for verse in verses[1:]:
    document.add_paragraph(verse)
# Insert a new, empty paragraph before the first verse
prior_paragraph_object = paragraph_object.insert_paragraph_before('')
# 4. Build Run content.
# A Paragraph is made of Runs; add_run() inserts content into a Paragraph:
#   paragraph_object.add_run('run_content')
run1 = prior_paragraph_object.add_run('*' * 13)
run2 = prior_paragraph_object.add_run('%' * 13)
# Style the Runs. Style switches listed by the tutorial:
#   bold: bold, italic: italic, underline: underline, strike: strikethrough
run1.bold = True
run2.underline = True

# Centre every paragraph currently in the document. The paragraph list is
# iterated directly instead of being re-read and indexed on each pass.
for paragraph in document.paragraphs:
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

# 5. Add a page break with add_page_break()
document.add_page_break()

# 6. Insert a picture.
# add_picture() inserts the image; the width/height values (Pt) come from the
# docx.shared module, which must be imported to size the picture.
document.add_picture('libai.jpeg', width=Pt(200), height=Pt(300))

# Centre the last paragraph of the document
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

# 7. Create a table.
# add_table(rows=n, cols=m) creates a table with n rows and m columns.
table = document.add_table(rows=2, cols=5)

# Fill the two initial rows: the header row first, then Li Bai's record.
# Each row's cells are fetched once and paired with their values.
initial_rows = (
    ('姓名', '字', '号', '所处时代', '别称'),
    ('李白', '太白', '青莲居士', '唐朝', '诗仙'),
)
for row, values in zip(table.rows, initial_rows):
    for cell, value in zip(row.cells, values):
        cell.text = value

# Append a row and fill it.
new_row = table.add_row()
for cell, value in zip(new_row.cells, ('白居易', '乐天', '香山居士', '唐朝', '诗魔')):
    cell.text = value

# Append a one-inch-wide column and fill it from the top row down.
new_column = table.add_column(width=Inches(1))
for cell, value in zip(
    new_column.cells,
    ('代表作', '《侠客行》、《静夜思》', '《长恨歌》、《琵琶行》'),
):
    cell.text = value

# Table dimensions after the additions.
rows = len(table.rows)
cols = len(table.columns)
print(f'rows: {rows}')
print(f'columns: {cols}')

# To print the table contents:
# for row in table.rows:
#     for cell in row.cells:
#         print(cell.text)

# Table style, selected by style *name*. Selecting it by style id
# ('LightShading-Accent1') produces:
# "UserWarning: style lookup by style_id is deprecated. Use style name as key instead."
table.style = 'Light Shading Accent 1'

# Centre every cell vertically and horizontally.
for r in range(rows):
    for c in range(cols):
        cell = table.cell(r, c)
        cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER  # vertical centring
        # Horizontal centring is set on the cell's first paragraph, so it takes
        # the paragraph-alignment enum rather than WD_TABLE_ALIGNMENT.
        cell.paragraphs[0].paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

# Page number: pass the first paragraph of the first section's footer to add_page_number()
footer_paragraph = document.sections[0].footer.paragraphs[0]
add_page_number(footer_paragraph)
# Save the document
document.save('test2.docx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

效果：
在这里插入图片描述

参考：

相关阅读:
【RocketMQ 十】RocketMQ工作原理之消息生产及存储
 10款小而美的macOS应用
 记录如何用php将敏感文字内容替换为星号的方法
 长江智城智慧平台框架的规划与实践
 CentOS7日志文件及journalctl日志查看
 2023.10月面试题目
 Spring MVC实现RESTful
SC-RoadDeepNet学习笔记（持续更新）
unity无法激活认证、无法保存许可证、及unity package manager Error
jenkins如何请求http接口及乱码问题解决
原文地址：https://blog.csdn.net/username666/article/details/126356507