forked from rjurney/pig-to-json
-
Notifications
You must be signed in to change notification settings - Fork 1
/
test.pig
57 lines (48 loc) · 3.66 KB
/
test.pig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
 * Script preamble: put the required jars on Pig's classpath and declare a
 * short alias for the piggybank Avro storage function.
 *
 *   avro-1.5.3.jar, json-simple-1.1.jar  - third-party libraries
 *   piggybank.jar                        - contains the AvroStorage class aliased below
 *   pig-to-json.jar                      - presumably supplies com.hortonworks.pig.udf.ToJson,
 *                                          the UDF exercised by this script (verify against the jar)
 */
REGISTER /me/Software/pig/build/ivy/lib/Pig/avro-1.5.3.jar;
REGISTER /me/Software/pig/build/ivy/lib/Pig/json-simple-1.1.jar;
REGISTER /me/Software/pig/contrib/piggybank/java/piggybank.jar;
REGISTER /me/Software/pig-to-json/dist/lib/pig-to-json.jar;

-- Lets the rest of the script write `AvroStorage()` instead of the fully qualified class name.
DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
-- Test 1: serialize a bag field (`tos`) to JSON.
-- The Enron email data set can be downloaded from
-- https://s3.amazonaws.com/rjurney_public_web/hadoop/enron.avro
emails = LOAD '/me/Data/enron.avro' USING AvroStorage();
emails = LIMIT emails 10;  -- keep the test small: at most 10 records

/* Schema as reported by `describe emails`:
emails: {message_id: chararray,
date: chararray,
from: (address: chararray,name: chararray),
subject: chararray,
body: chararray,
tos: {ARRAY_ELEM: (address: chararray,name: chararray)},
ccs: {ARRAY_ELEM: (address: chararray,name: chararray)},
bccs: {ARRAY_ELEM: (address: chararray,name: chararray)}
} */

-- For every email emit its id plus the recipient bag rendered by ToJson.
tos_as_json = FOREACH emails GENERATE
    message_id,
    com.hortonworks.pig.udf.ToJson(tos) AS bag_json;
DUMP tos_as_json;

/* Sample output:
(<589.1075842593084.JavaMail.evans@thyme>,[{"address":"[email protected]","name":null}])
(<614.1075847580822.JavaMail.evans@thyme>,[{"address":"[email protected]","name":null},{"address":"[email protected]","name":null},{"address":"[email protected]","name":null},{"address":"[email protected]","name":null}])
(<735.1075840186524.JavaMail.evans@thyme>,[{"address":"[email protected]","name":"Kam Keiser"},{"address":"[email protected]","name":"Mike Grigsby"}])
(<758.1075842602845.JavaMail.evans@thyme>,[{"address":"[email protected]","name":null}])
(<765.1075860359973.JavaMail.evans@thyme>,[{"address":"[email protected]","name":null},{"address":"[email protected]","name":null}]) */
-- Test 2: serialize a tuple field (`from`) to JSON.
-- The data is loaded again under its own alias so this test does not touch `emails`.
sender_sample = LOAD '/me/Data/enron.avro' USING AvroStorage();
sender_sample = LIMIT sender_sample 10;  -- at most 10 records

-- For every email emit its id plus the sender tuple rendered by ToJson.
from_as_json = FOREACH sender_sample GENERATE
    message_id,
    com.hortonworks.pig.udf.ToJson(from) AS tuple_json;
DUMP from_as_json;

/* Sample output:
(<28.1075842613917.JavaMail.evans@thyme>,{"address":"[email protected]","name":"\"Emmye\""})
(<85.1075854368299.JavaMail.evans@thyme>,{"address":"[email protected]","name":null})
(<167.1075851646300.JavaMail.evans@thyme>,{"address":"[email protected]","name":"Jeff Dasovich"})
(<185.1075857304356.JavaMail.evans@thyme>,{"address":"[email protected]","name":"Chris Dorland"})
(<735.1075840186524.JavaMail.evans@thyme>,{"address":"[email protected]","name":"Phillip M. Love"})
(<758.1075842602845.JavaMail.evans@thyme>,{"address":"[email protected]","name":null})
(<765.1075860359973.JavaMail.evans@thyme>,{"address":"[email protected]","name":null}) */
-- Test 3: ToJson also works for arbitrarily complex (nested) data structures.

-- Group the emails by sender address.
by_sender = GROUP emails BY from.address;

-- One row per (sender, email): the sender address, the number of emails in the
-- sender's group, and that email's `tos` bag (FLATTEN un-nests the bag of
-- per-email `tos` values into separate rows).
a = FOREACH by_sender GENERATE
    group AS from_address,
    COUNT_STAR(emails) AS sent_count,
    FLATTEN(emails.tos) AS tos;

-- Collect those rows back into one bag per sender and render the whole bag,
-- including the nested `tos` bags, as a single JSON value.
b = GROUP a BY from_address;
c = FOREACH b GENERATE
    group AS from_address,
    com.hortonworks.pig.udf.ToJson(a) AS json_test;

-- STORE refuses to write to a location that already exists, so clear any
-- output left by a previous run first. rmf does not fail when the path is absent.
rmf /tmp/big_test_num
STORE c INTO '/tmp/big_test_num';

/* Sample output:
[email protected] [{"tos":[{"address":"[email protected]","name":null}],"sent_count":1,"from_address":"[email protected]"}]
[email protected] [{"tos":[{"address":"[email protected]","name":null}],"sent_count":1,"from_address":"[email protected]"}]
[email protected] [{"tos":[{"address":"[email protected]","name":null}],"sent_count":1,"from_address":"[email protected]"}]
[email protected] [{"tos":[{"address":"[email protected]","name":"Shena Cherian"}],"sent_count":1,"from_address":"[email protected]"}]
[email protected] [{"tos":[{"address":"[email protected]","name":null}],"sent_count":1,"from_address":"[email protected]"}]
[email protected] [{"tos":[{"address":"[email protected]","name":null}],"sent_count":1,"from_address":"[email protected]"}] */