问题描述
我创建了一个脚本,用于在一个文件夹中查找 pdf 中的单词,然后如果找到它会将 pdf 移动到另一个文件夹。
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Script as posted in the question: search every PDF in 'Folder 1' for a
# pattern and copy the matching files to 'Folder 2'.

# Regular expression typed by the user; it is matched case-insensitively
# against the extracted text of each page.
pattern = input("Enter string pattern to search: ")
# Raw string so the backslashes are taken literally (the value is unchanged).
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

for file_name in os.scandir(src):
    # NOTE: file_name is an os.DirEntry, not a bare file name. The joins
    # below are intentionally kept as posted: they are what produces the
    # SameFileError reported right after this listing.
    file = PyPDF2.PdfFileReader(str(src / file_name), 'rb')
    numPages = file.getNumPages()
    for i in range(0, numPages):
        pageObj = file.getPage(i)
        text = pageObj.extractText()
        # One copy is attempted per match found on the page.
        for match in re.findall(pattern, text, re.IGNORECASE):
            shutil.copyfile(str(src / file_name), str(dst / file_name))
当我运行它时出现错误:
SameFileError: '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' and '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' are the same file
出于某种原因,目标路径 dst 被 src 的路径替换了。为什么会这样?该如何修复?
dst
Out[99]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 2')
file_name
Out[100]: <DirEntry 'Daily Production Summary 1.pdf'>
dst/file_name
Out[101]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 1/Daily Production Summary 1.pdf')
解决方法
我发现 file_name 是一个 os.DirEntry 对象,参与路径拼接时会被当作文件的完整路径;由于该路径是绝对路径,它会替换掉左侧的 src、dst,因此下面两个表达式得到的是同一个路径:
src / file_name
dst / file_name
您只需要获得名称 file_name.name
src / file_name.name
dst / file_name.name
顺便说一句:
完整路径
print( file_name.path )
只有文件名
print( file_name.name )
顺便说一句:你在每次匹配之后都会复制同一个文件,但其实只需要复制一次。
可以使用变量 found 记录是否找到匹配,
并在 for i
循环结束后再复制
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Search every PDF in 'Folder 1' for a user-supplied pattern and copy each
# matching file to 'Folder 2' exactly once, using a `found` flag.

# Regular expression typed by the user; matched case-insensitively.
pattern = input("Enter string pattern to search: ")
# Raw string so the backslashes are taken literally (the value is unchanged).
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

for file_name in os.scandir(src):
    # Skip sub-directories and anything that is not a PDF; the reader would
    # fail on them.
    if not file_name.is_file() or not file_name.name.lower().endswith('.pdf'):
        continue

    # file_name is an os.DirEntry that stands for the entry's full path when
    # joined, so use only the bare name (file_name.name) to build paths.
    # The second positional argument of PdfFileReader is `strict`, not a file
    # mode; `strict=True` is what the former truthy 'rb' amounted to.
    file = PyPDF2.PdfFileReader(str(src / file_name.name), strict=True)
    numPages = file.getNumPages()

    found = False
    for i in range(numPages):
        text = file.getPage(i).extractText()
        # re.search is enough: only the existence of a match matters.
        if re.search(pattern, text, re.IGNORECASE):
            found = True
            break  # no need to read the remaining pages

    # Copy after the page loop so the file is copied at most once.
    if found:
        shutil.copyfile(str(src / file_name.name), str(dst / file_name.name))
或在第一次复制后使用 break
跳过 for i
循环
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Search every PDF in 'Folder 1' for a user-supplied pattern and copy each
# matching file to 'Folder 2'; the page loop stops at the first match.

# Regular expression typed by the user; matched case-insensitively.
pattern = input("Enter string pattern to search: ")
# Raw string so the backslashes are taken literally (the value is unchanged).
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

for file_name in os.scandir(src):
    # Skip sub-directories and anything that is not a PDF; the reader would
    # fail on them.
    if not file_name.is_file() or not file_name.name.lower().endswith('.pdf'):
        continue

    # file_name is an os.DirEntry that stands for the entry's full path when
    # joined, so use only the bare name (file_name.name) to build paths.
    # The second positional argument of PdfFileReader is `strict`, not a file
    # mode; `strict=True` is what the former truthy 'rb' amounted to.
    file = PyPDF2.PdfFileReader(str(src / file_name.name), strict=True)
    numPages = file.getNumPages()

    for i in range(numPages):
        text = file.getPage(i).extractText()
        # re.search is enough: only the existence of a match matters.
        if re.search(pattern, text, re.IGNORECASE):
            shutil.copyfile(str(src / file_name.name), str(dst / file_name.name))
            break  # there is no need to check rest of PDF