Problem description
I want to upload files to my HDFS using the Python hdfs module. My Hadoop cluster runs under docker-compose, and I want to upload about 800,000 files. The code looks like:
#!/home/fengnx/anaconda3/bin/python
from tqdm import tqdm
from hdfs import Client
import os


class pyhdfs(object):
    """Thin wrapper around the hdfs WebHDFS Client."""

    def __init__(self, ip, port):
        self.ip = ip
        self.port = port
        self.url = 'http://' + self.ip + ':' + self.port
        self.client = Client(self.url)

    def mkdir(self, dirname, permission=''):
        if permission:
            permission = int(permission)
        else:
            permission = 644
        self.client.makedirs(dirname, permission=permission)

    def rmdir(self, dirname):
        self.client.delete(dirname, True)

    def upload(self, dirname, filename):
        # The signature must take the target directory as well: the call
        # at the bottom passes both '/ddbs' and the local file path.
        self.client.upload(dirname, filename)

    def download(self, filename, download_filename):
        self.client.download(filename, download_filename)

    def cat(self, filename):
        with self.client.read(filename) as f:
            return f.read()


class file_file(object):
    """Recursively collect every file path under a working directory."""

    def __init__(self, work_dir):
        self.work_dir = work_dir
        self.all_files = []

    def get_all_files(self):
        for parent, dirnames, filenames in os.walk(self.work_dir):
            for filename in filenames:
                file_path = os.path.join(parent, filename)
                self.all_files.append(file_path)
        return self.all_files


if __name__ == '__main__':
    a = pyhdfs('127.0.0.1', '9870')
    a.mkdir('/ddbs')
    files = file_file('../articles')
    all_files = files.get_all_files()
    for file in tqdm(all_files):
        #print(file)
        a.upload('/ddbs', file)
But at a.upload('/ddbs', file) I get:

urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7fd9844da7f0>: Failed to establish a new connection: [Errno 22] Invalid argument
The docker-compose.yml is:
version: "3"

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.2

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode1
    restart: always
    hostname: 172.21.0.3
    volumes:
      - hadoop_datanode1:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.3

  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode2
    restart: always
    hostname: 172.21.0.4
    volumes:
      - hadoop_datanode2:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.4

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864"
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.5

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.6

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
    container_name: historyserver
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
    networks:
      hadoopnet:
        ipv4_address: 172.21.0.7

volumes:
  hadoop_namenode:
  hadoop_datanode1:
  hadoop_datanode2:
  hadoop_historyserver:

networks:
  hadoopnet:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 172.21.0.0/24
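As a side note on scale: the loop above makes one WebHDFS round-trip per file, which is slow for roughly 800,000 files. The hdfs library's Client.upload can also take a local directory and recurse over it, parallelizing the per-file transfers with n_threads. A hedged sketch, not from the original post (the HDFS user name and thread count are assumptions; InsecureClient is used instead of the bare Client so a user can be set):

from hdfs import InsecureClient

# Assumption: NameNode WebHDFS published on 127.0.0.1:9870, HDFS user 'root'.
client = InsecureClient('http://127.0.0.1:9870', user='root')
client.makedirs('/ddbs')
# upload() recurses into the local directory; n_threads parallelizes the
# per-file transfers, and overwrite=True replaces any existing copies.
client.upload('/ddbs', '../articles', n_threads=8, overwrite=True)

This only helps once the connection problem itself is resolved, since every transfer still goes through the same NameNode-to-DataNode redirect.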