Pyspark,联接,sql,外部联接

问题描述

我有这样一个需求:我想对两个表 A 和 B 做外部联接,如果键匹配,输出应采用表 B 的列值(而不是表 A 的列值);键不匹配的行则原样保留。例如:

A
a b
1 abc
2 fgh
3 xyz

B
a b
1 wer
6 uio

输出

a b
1 wer
2 fgh
3 xyz
6 uio

解决方法

这是一个优先级查询。您似乎想要基于第一列(键 `a`)取 `B` 中的所有行,然后再加上 `A` 中键不匹配的行。

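一种方法是使用 `union all`:先取 `B` 的所有行,再追加 `A` 中键不在 `B` 中的行:

select b.a, b.b from B b
union all
select a.a, a.b from A a where not exists (select 1 from B b where b.a = a.a);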

PySpark 的解决方案是使用 full 联接和 `coalesce`:

from pyspark.sql import functions as F

# Build the two example inputs. Both share the schema (a, b): `a` is the join
# key and `b` is the value column. `spark` is the active SparkSession.
A = spark.createDataFrame(data=[[1,'abc'],[2,'fgh'],[3,'xyz']],schema=['a','b'])
B = spark.createDataFrame(data=[[1,'wer'],[6,'uio']],schema=['a','b'])

# Rename column `b` on each side to prevent a naming collision after the join
A = A.select('a',F.col('b').alias('b_a'))
B = B.select('a',F.col('b').alias('b_b'))

# Full join on `a` keeps all entries from both dataframes
combined = A.join(B,on='a',how='full')

# Coalesce takes value from `b_b` if not null and `b_a` otherwise.
# NOTE: a matching row in B whose `b` is null therefore falls back to A's value.
combined = combined.withColumn('b',F.coalesce('b_b','b_a'))

# Drop unneeded helper columns
combined = combined.drop('b_b','b_a')

# Row order after a join is not guaranteed; sort by key so the printed
# table is stable.
combined.orderBy('a').show()

结果

+---+---+
|  a|  b|
+---+---+
|  1|wer|
|  2|fgh|
|  3|xyz|
|  6|uio|
+---+---+