While doing a quick-and-dirty URL fetch from Hive, I wanted the URLs to be distributed among 5 jobs. The input is small, so it is very hard to tune things on the mapper side so that the work happens on, say, 5 mappers.
Regular:
-- Baseline: run curl_url on every row of the URL queue and overwrite the
-- dt = 20130606 partition with the result. Each newline or carriage return
-- in the UDF output is replaced with a single space.
-- NOTE(review): this statement writes to url_raw_contant while the
-- forced-reducer variant writes to url_raw_contant_table - confirm which
-- table name is intended.
INSERT OVERWRITE TABLE url_raw_contant PARTITION (dt = 20130606)
SELECT
    q.full_url,
    q.priority,
    regexp_replace(curl_url(q.full_url), '\n|\r', ' ') AS raw_html
FROM url_queue_table_sharded_temp q;
Forcing UDF execution onto the reducers (5 reducers):
-- Forced-reducer variant: request 5 reduce tasks and shuffle the queue rows
-- by md5(full_url) % 5, with the intent that curl_url is evaluated on the
-- reduce side instead of in the mappers.
-- The inner query also orders rows within each reducer by bucket and then by
-- priority, highest first.
-- NOTE(review): md5 is not defined in this snippet. If it returns a hex
-- string, "% 5" may not produce five usable buckets - confirm its return type.
SET mapred.reduce.tasks=5;

INSERT OVERWRITE TABLE url_raw_contant_table PARTITION (dt = 20130606)
SELECT
    full_url,
    priority,
    regexp_replace(curl_url(full_url), '\n|\r', ' ') AS raw_html
FROM (
    SELECT
        full_url,
        priority
    FROM url_queue_table_sharded_temp
    DISTRIBUTE BY md5(full_url) % 5
    SORT BY md5(full_url) % 5, priority DESC
) sharded
DISTRIBUTE BY md5(full_url) % 5;
Regular:
-- Baseline: run curl_url on every row of the URL queue and overwrite the
-- dt = 20130606 partition with the result. Each newline or carriage return
-- in the UDF output is replaced with a single space.
-- NOTE(review): this statement writes to url_raw_contant while the
-- forced-reducer variant writes to url_raw_contant_table - confirm which
-- table name is intended.
INSERT OVERWRITE TABLE url_raw_contant PARTITION (dt = 20130606)
SELECT
    q.full_url,
    q.priority,
    regexp_replace(curl_url(q.full_url), '\n|\r', ' ') AS raw_html
FROM url_queue_table_sharded_temp q;
Forcing UDF execution onto the reducers (5 reducers):
-- Forced-reducer variant: request 5 reduce tasks and shuffle the queue rows
-- by md5(full_url) % 5, with the intent that curl_url is evaluated on the
-- reduce side instead of in the mappers.
-- The inner query also orders rows within each reducer by bucket and then by
-- priority, highest first.
-- NOTE(review): md5 is not defined in this snippet. If it returns a hex
-- string, "% 5" may not produce five usable buckets - confirm its return type.
SET mapred.reduce.tasks=5;

INSERT OVERWRITE TABLE url_raw_contant_table PARTITION (dt = 20130606)
SELECT
    full_url,
    priority,
    regexp_replace(curl_url(full_url), '\n|\r', ' ') AS raw_html
FROM (
    SELECT
        full_url,
        priority
    FROM url_queue_table_sharded_temp
    DISTRIBUTE BY md5(full_url) % 5
    SORT BY md5(full_url) % 5, priority DESC
) sharded
DISTRIBUTE BY md5(full_url) % 5;





