Learning Spark Python API Functions: pyspark API (3) – 过往记忆

histogram

# histogram (example #1)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram(buckets = 2)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([1, 2, 3], [2, 3])

# histogram (example #2)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram([0,0.5,1,1.5,2,2.5,3,3.5])
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5], [0, 0, 2, 0, 1, 0, 2])
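One detail the two examples above don't show: with explicit boundaries, every bucket is half-open [a, b) except the last, which also includes its right endpoint. A minimal sketch, using the same sc and data as above:

# Buckets are [a, b) except the last, which is [a, b],
# so the maximum value 3 is counted in the final bucket.
x = sc.parallelize([1,3,1,2,3])
print(x.histogram([1, 2, 3]))

([1, 2, 3], [2, 3])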

mean

# mean
x = sc.parallelize([1,3,2])
y = x.mean()
print(x.collect())
print(y)

[1, 3, 2]
2.0

variance

# variance
x = sc.parallelize([1,3,2])
y = x.variance()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.666666666667

stdev

# stdev
x = sc.parallelize([1,3,2])
y = x.stdev()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.816496580928

sampleStdev

# sampleStdev
x = sc.parallelize([1,3,2])
y = x.sampleStdev()  # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0

sampleVariance

# sampleVariance
x = sc.parallelize([1,3,2])
y = x.sampleVariance()  # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0
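The four statistics above are related: sampleVariance() is variance() scaled by N/(N-1) (here 0.666... × 3/2 = 1.0). All of them are also available in a single pass via stats(), which returns a StatCounter; a minimal sketch:

# stats() makes one pass over the RDD and returns a StatCounter
# exposing count, mean, variance/stdev and their sample variants.
x = sc.parallelize([1,3,2])
s = x.stats()
print(s.variance())        # divides by N   -> 0.666666666667
print(s.sampleVariance())  # divides by N-1 -> 1.0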

countByValue

# countByValue
x = sc.parallelize([1,3,1,2,3])
y = x.countByValue()
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
defaultdict(<type 'int'>, {1: 2, 2: 1, 3: 2})
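countByValue() is an action that returns the whole tally to the driver, so it fits data with a modest number of distinct values. A sketch of the distributed equivalent that keeps the counts in an RDD (reduceByKey is covered below):

from operator import add
x = sc.parallelize([1,3,1,2,3])
y = x.map(lambda v: (v, 1)).reduceByKey(add)  # counts stay distributed
print(y.collect())

[(1, 2), (2, 1), (3, 2)]  (pair order may vary)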

top

# top
x = sc.parallelize([1,3,1,2,3])
y = x.top(num = 3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[3, 3, 2]

takeOrdered

# takeOrdered
x = sc.parallelize([1,3,1,2,3])
y = x.takeOrdered(num = 3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[1, 1, 2]
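takeOrdered() also accepts a key function; negating each value reproduces top()'s descending order. A minimal sketch with the same data:

x = sc.parallelize([1,3,1,2,3])
print(x.takeOrdered(3, key=lambda v: -v))  # sort descending by value

[3, 3, 2]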

take

# take
x = sc.parallelize([1,3,1,2,3])
y = x.take(num = 3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[1, 3, 1]

first

# first
x = sc.parallelize([1,3,1,2,3])
y = x.first()
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
1

collectAsMap

# collectAsMap
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.collectAsMap()
print(x.collect())
print(y)

[('C', 3), ('A', 1), ('B', 2)]
{'A': 1, 'C': 3, 'B': 2}
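One caveat: collectAsMap() keeps only one value per key, so duplicate keys silently collapse (in this sketch the later pair wins):

x = sc.parallelize([('A',1),('A',2)])
print(x.collectAsMap())  # only one of the two values for 'A' survives

{'A': 2}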

keys

# keys
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.keys()
print(x.collect())
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
['C', 'A', 'B']

values

# values
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.values()
print(x.collect())
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
[3, 1, 2]

reduceByKey

# reduceByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKey(lambda agg, obj: agg + obj)
print(x.collect())
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', 12), ('B', 3)]
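Because the merge function is applied first inside each partition and then across partitions, it must be associative and commutative; for plain sums, operator.add is the idiomatic choice. A sketch:

from operator import add
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
print(x.reduceByKey(add).collect())

[('A', 12), ('B', 3)]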

reduceByKeyLocally

# reduceByKeyLocally
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKeyLocally(lambda agg, obj: agg + obj)
print(x.collect())
print(y)

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
{'A': 12, 'B': 3}
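The contrast with reduceByKey above: reduceByKey is a lazy transformation that returns an RDD of pairs, while reduceByKeyLocally is an action that eagerly merges everything into a plain dict on the driver, so use it only when the result fits in driver memory. A minimal sketch:

x = sc.parallelize([('B',1),('B',2),('A',3)])
y1 = x.reduceByKey(lambda agg, obj: agg + obj)        # transformation: returns an RDD, evaluated lazily
y2 = x.reduceByKeyLocally(lambda agg, obj: agg + obj) # action: returns a dict immediately
print(y2)

{'A': 3, 'B': 3}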