01 | # histogram (example #1): pass a bucket count; evenly spaced bucket boundaries are computed between the RDD's min and max |
02 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
03 | y = x.histogram(buckets = 2 ) |
04 | print (x.collect()) |
05 | print (y) |
06 |
07 | [ 1 , 3 , 1 , 2 , 3 ] |
08 | ([ 1 , 2 , 3 ], [ 2 , 3 ]) |
09 |
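10 | # histogram (example #2): pass explicit, sorted bucket boundaries; each bucket is [lo, hi) except the last, which also includes its upper bound |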
11 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
12 | y = x.histogram([ 0 , 0.5 , 1 , 1.5 , 2 , 2.5 , 3 , 3.5 ]) |
13 | print (x.collect()) |
14 | print (y) |
15 |
16 | [ 1 , 3 , 1 , 2 , 3 ] |
17 | ([ 0 , 0.5 , 1 , 1.5 , 2 , 2.5 , 3 , 3.5 ], [ 0 , 0 , 2 , 0 , 1 , 0 , 2 ]) |
1 | # mean |
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.mean() |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 2 ] |
8 | 2.0 |
1 | # variance |
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.variance() # divides by N |
4 | print (x.collect()) |
5 | print (y) |
6 | [ 1 , 3 , 2 ] |
7 | 0.666666666667 |
1 | # stdev |
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.stdev() # divides by N |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 2 ] |
8 | 0.816496580928 |
1 | # sampleStdev |
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.sampleStdev() # divides by N-1 |
4 | print (x.collect()) |
5 | print (y) |
6 | [ 1 , 3 , 2 ] |
7 | 1.0 |
1 | # sampleVariance |
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.sampleVariance() # divides by N-1 |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 2 ] |
8 | 1.0 |
1 | # countByValue |
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.countByValue() |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 1 , 2 , 3 ] |
8 | defaultdict(< type 'int' >, { 1 : 2 , 2 : 1 , 3 : 2 }) |
1 | # top: the num largest elements, in descending order |
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.top(num = 3 ) |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 1 , 2 , 3 ] |
8 | [ 3 , 3 , 2 ] |
1 | # takeOrdered: the num smallest elements, in ascending order |
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.takeOrdered(num = 3 ) |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 1 , 2 , 3 ] |
8 | [ 1 , 1 , 2 ] |
1 | # take: the first num elements of the RDD, in their existing order (no sorting) |
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.take(num = 3 ) |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 1 , 2 , 3 ] |
8 | [ 1 , 3 , 1 ] |
1 | # first |
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.first() |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [ 1 , 3 , 1 , 2 , 3 ] |
8 | 1 |
1 | # collectAsMap |
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
3 | y = x.collectAsMap() |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
8 | { 'A' : 1 , 'C' : 3 , 'B' : 2 } |
1 | # keys |
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
3 | y = x.keys() |
4 | print (x.collect()) |
5 | print (y.collect()) |
6 |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
8 | [ 'C' , 'A' , 'B' ] |
1 | # values |
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
3 | y = x.values() |
4 | print (x.collect()) |
5 | print (y.collect()) |
6 |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
8 | [ 3 , 1 , 2 ] |
1 | # reduceByKey: merge the values of each key with the given function; the result is an RDD of (key, value) pairs |
2 | x = sc.parallelize([( 'B' , 1 ),( 'B' , 2 ),( 'A' , 3 ),( 'A' , 4 ),( 'A' , 5 )]) |
3 | y = x.reduceByKey( lambda agg, obj: agg + obj) |
4 | print (x.collect()) |
5 | print (y.collect()) |
6 |
7 | [( 'B' , 1 ), ( 'B' , 2 ), ( 'A' , 3 ), ( 'A' , 4 ), ( 'A' , 5 )] |
8 | [( 'A' , 12 ), ( 'B' , 3 )] |
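1 | # reduceByKeyLocally: merge the values of each key with the given function; the result is returned to the driver as a dict, not an RDD |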
2 | x = sc.parallelize([( 'B' , 1 ),( 'B' , 2 ),( 'A' , 3 ),( 'A' , 4 ),( 'A' , 5 )]) |
3 | y = x.reduceByKeyLocally( lambda agg, obj: agg + obj) |
4 | print (x.collect()) |
5 | print (y) |
6 |
7 | [( 'B' , 1 ), ( 'B' , 2 ), ( 'A' , 3 ), ( 'A' , 4 ), ( 'A' , 5 )] |
8 | { 'A' : 12 , 'B' : 3 } |
联系客服