school-of-sre/big_data/evolution/index.html

1422 lines
37 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="shortcut icon" href="../../img/favicon.ico">
<meta name="generator" content="mkdocs-1.1.2, mkdocs-material-6.2.8">
<title>Evolution and Architecture of Hadoop - School Of SRE</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.cb6bc1d0.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.39b8e14a.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
<style>body,input{font-family:"Roboto",-apple-system,BlinkMacSystemFont,Helvetica,Arial,sans-serif}code,kbd,pre{font-family:"Roboto Mono",SFMono-Regular,Consolas,Menlo,monospace}</style>
<link rel="stylesheet" href="../../stylesheets/custom.css">
</head>
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="none" data-md-color-accent="none">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#evolution-of-hadoop" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<script async defer data-domain="linkedin.github.io" src="https://tracking.eskratch.com/js/plausible.js"></script>
<header class="md-header" data-md-component="header">
<nav class="md-header-nav md-grid" aria-label="Header">
<a href="../.." title="School Of SRE" class="md-header-nav__button md-logo" aria-label="School Of SRE">
<img src="../../img/sos.png" alt="logo">
</a>
<!-- Button to open drawer -->
<label class="md-header-nav__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
</label>
<div class="md-header-nav__title" data-md-component="header-title">
<div class="md-header-nav__ellipsis">
<span class="md-header-nav__topic md-ellipsis">
<a href="../.." title="School Of SRE" >
School Of SRE
</a>
</span>
<span class="md-header-nav__topic md-ellipsis">
Evolution and Architecture of Hadoop
</span>
</div>
</div>
<label class="md-header-nav__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0116 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 019.5 16 6.5 6.5 0 013 9.5 6.5 6.5 0 019.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" data-md-state="active" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0116 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 019.5 16 6.5 6.5 0 013 9.5 6.5 6.5 0 019.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
</label>
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" data-md-component="search-reset" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41L17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
</button>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list"></ol>
</div>
</div>
</div>
</div>
</div>
</nav>
<div style="background-color: hsla(219, 54%, 51%, 1); border: 2px ; margin: 0px; padding: 2px; font-size: 200%; font-weight: bold; text-align: center;">
Join our AMA on <u><a href="https://linkedin.github.io/school-of-sre/linux_networking/intro/">Linux Networking Course</a></u> in the <u><a href="https://www.linkedin.com/groups/12493545/">School-of-SRE LinkedIn Group </a></u> from 19 Feb 4PM IST to 7PM IST(10.30AM to 1.30PM GMT)
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
Home
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-2" type="checkbox" id="nav-2" >
<label class="md-nav__link" for="nav-2">
Fundamentals Series
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Fundamentals Series" data-md-level="1">
<label class="md-nav__title" for="nav-2">
<span class="md-nav__icon md-icon"></span>
Fundamentals Series
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-2-1" type="checkbox" id="nav-2-1" >
<label class="md-nav__link" for="nav-2-1">
Linux Basics
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Linux Basics" data-md-level="2">
<label class="md-nav__title" for="nav-2-1">
<span class="md-nav__icon md-icon"></span>
Linux Basics
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../linux_basics/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_basics/command_line_basics/" class="md-nav__link">
Command Line Basics
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_basics/linux_server_administration/" class="md-nav__link">
Server Administration
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_basics/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-2-2" type="checkbox" id="nav-2-2" >
<label class="md-nav__link" for="nav-2-2">
Git
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Git" data-md-level="2">
<label class="md-nav__title" for="nav-2-2">
<span class="md-nav__icon md-icon"></span>
Git
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../git/git-basics/" class="md-nav__link">
Git Basics
</a>
</li>
<li class="md-nav__item">
<a href="../../git/branches/" class="md-nav__link">
Working With Branches
</a>
</li>
<li class="md-nav__item">
<a href="../../git/github-hooks/" class="md-nav__link">
Github and Hooks
</a>
</li>
<li class="md-nav__item">
<a href="../../git/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-2-3" type="checkbox" id="nav-2-3" >
<label class="md-nav__link" for="nav-2-3">
Linux Networking
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Linux Networking" data-md-level="2">
<label class="md-nav__title" for="nav-2-3">
<span class="md-nav__icon md-icon"></span>
Linux Networking
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../linux_networking/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/dns/" class="md-nav__link">
DNS
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/udp/" class="md-nav__link">
UDP
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/http/" class="md-nav__link">
HTTP
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/tcp/" class="md-nav__link">
TCP
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/ipr/" class="md-nav__link">
Routing
</a>
</li>
<li class="md-nav__item">
<a href="../../linux_networking/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-3" type="checkbox" id="nav-3" >
<label class="md-nav__link" for="nav-3">
Python and Web
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Python and Web" data-md-level="1">
<label class="md-nav__title" for="nav-3">
<span class="md-nav__icon md-icon"></span>
Python and Web
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../python_web/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../python_web/python-concepts/" class="md-nav__link">
Some Python Concepts
</a>
</li>
<li class="md-nav__item">
<a href="../../python_web/python-web-flask/" class="md-nav__link">
Python, Web and Flask
</a>
</li>
<li class="md-nav__item">
<a href="../../python_web/url-shorten-app/" class="md-nav__link">
The URL Shortening App
</a>
</li>
<li class="md-nav__item">
<a href="../../python_web/sre-conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-4" type="checkbox" id="nav-4" checked>
<label class="md-nav__link" for="nav-4">
Data
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Data" data-md-level="1">
<label class="md-nav__title" for="nav-4">
<span class="md-nav__icon md-icon"></span>
Data
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-4-1" type="checkbox" id="nav-4-1" >
<label class="md-nav__link" for="nav-4-1">
Relational Databases
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Relational Databases" data-md-level="2">
<label class="md-nav__title" for="nav-4-1">
<span class="md-nav__icon md-icon"></span>
Relational Databases
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../databases_sql/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/concepts/" class="md-nav__link">
Key Concepts
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/mysql/" class="md-nav__link">
MySQL
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/innodb/" class="md-nav__link">
InnoDB
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/operations/" class="md-nav__link">
Operational Concepts
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/lab/" class="md-nav__link">
Lab
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_sql/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-4-2" type="checkbox" id="nav-4-2" >
<label class="md-nav__link" for="nav-4-2">
NoSQL
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="NoSQL" data-md-level="2">
<label class="md-nav__title" for="nav-4-2">
<span class="md-nav__icon md-icon"></span>
NoSQL
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../databases_nosql/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_nosql/key_concepts/" class="md-nav__link">
Key Concepts
</a>
</li>
<li class="md-nav__item">
<a href="../../databases_nosql/further_reading/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-4-3" type="checkbox" id="nav-4-3" checked>
<label class="md-nav__link" for="nav-4-3">
Big Data
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Big Data" data-md-level="2">
<label class="md-nav__title" for="nav-4-3">
<span class="md-nav__icon md-icon"></span>
Big Data
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
<a href="./" class="md-nav__link md-nav__link--active">
Evolution and Architecture of Hadoop
</a>
</li>
<li class="md-nav__item">
<a href="../tasks/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-5" type="checkbox" id="nav-5" >
<label class="md-nav__link" for="nav-5">
Systems Design
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Systems Design" data-md-level="1">
<label class="md-nav__title" for="nav-5">
<span class="md-nav__icon md-icon"></span>
Systems Design
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../systems_design/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../systems_design/scalability/" class="md-nav__link">
Scalability
</a>
</li>
<li class="md-nav__item">
<a href="../../systems_design/availability/" class="md-nav__link">
Availability
</a>
</li>
<li class="md-nav__item">
<a href="../../systems_design/fault-tolerance/" class="md-nav__link">
Fault Tolerance
</a>
</li>
<li class="md-nav__item">
<a href="../../systems_design/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-6" type="checkbox" id="nav-6" >
<label class="md-nav__link" for="nav-6">
Metrics and Monitoring
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Metrics and Monitoring" data-md-level="1">
<label class="md-nav__title" for="nav-6">
<span class="md-nav__icon md-icon"></span>
Metrics and Monitoring
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/introduction/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/command-line_tools/" class="md-nav__link">
Command-line Tools
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/third-party_monitoring/" class="md-nav__link">
Third-party Monitoring
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/alerts/" class="md-nav__link">
Proactive Monitoring with Alerts
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/best_practices/" class="md-nav__link">
Best Practices for Monitoring
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/observability/" class="md-nav__link">
Observability
</a>
</li>
<li class="md-nav__item">
<a href="../../metrics_and_monitoring/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle" data-md-toggle="nav-7" type="checkbox" id="nav-7" >
<label class="md-nav__link" for="nav-7">
Security
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" aria-label="Security" data-md-level="1">
<label class="md-nav__title" for="nav-7">
<span class="md-nav__icon md-icon"></span>
Security
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../security/intro/" class="md-nav__link">
Introduction
</a>
</li>
<li class="md-nav__item">
<a href="../../security/fundamentals/" class="md-nav__link">
Fundamentals of Security
</a>
</li>
<li class="md-nav__item">
<a href="../../security/network_security/" class="md-nav__link">
Network Security
</a>
</li>
<li class="md-nav__item">
<a href="../../security/threats_attacks_defences/" class="md-nav__link">
Threat, Attacks & Defences
</a>
</li>
<li class="md-nav__item">
<a href="../../security/writing_secure_code/" class="md-nav__link">
Writing Secure code
</a>
</li>
<li class="md-nav__item">
<a href="../../security/conclusion/" class="md-nav__link">
Conclusion
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../CONTRIBUTING/" class="md-nav__link">
Contribute
</a>
</li>
<li class="md-nav__item">
<a href="../../CODE_OF_CONDUCT/" class="md-nav__link">
Code of Conduct
</a>
</li>
<li class="md-nav__item">
<a href="../../sre_community/" class="md-nav__link">
SRE Community
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
</nav>
</div>
</div>
</div>
<div class="md-content">
<article class="md-content__inner md-typeset">
<h1 id="evolution-of-hadoop">Evolution of Hadoop</h1>
<p><img alt="Evolution of hadoop" src="../images/hadoop_evolution.png" /></p>
<h1 id="architecture-of-hadoop">Architecture of Hadoop</h1>
<ol>
<li>
<p><strong>HDFS</strong></p>
<ol>
<li>The Hadoop Distributed File System (HDFS) is a distributed file system designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from other distributed file systems are significant.</li>
<li>HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. HDFS provides high throughput access to application data and is suitable for applications that have large data sets.</li>
<li>HDFS is part of the <a href="https://github.com/apache/hadoop">Apache Hadoop Core project</a>.</li>
</ol>
<p><img alt="HDFS Architecture" src="../images/hdfs_architecture.png" /></p>
<ol>
<li>NameNode: is the arbitrator and central repository of file namespace in the cluster. The NameNode executes the operations such as opening, closing, and renaming files and directories.</li>
<li>DataNode: manages the storage attached to the node on which it runs. It is responsible for serving all the read and writes requests. It performs operations on instructions on NameNode such as creation, deletion, and replications of blocks.</li>
<li>Client: Responsible for getting the required metadata from the namenode and then communicating with the datanodes for reads and writes. </br></br></br></li>
</ol>
</li>
<li>
<p><strong>YARN</strong></p>
<ol>
<li>YARN stands for “Yet Another Resource Negotiator“. It was introduced in Hadoop 2.0 to remove the bottleneck on Job Tracker which was present in Hadoop 1.0. YARN was described as a “Redesigned Resource Manager” at the time of its launching, but it has now evolved to be known as a large-scale distributed operating system used for Big Data processing.</li>
<li>The main components of YARN architecture include:</li>
</ol>
<p><img alt="YARN Architecture" src="../images/yarn_architecture.gif" /></p>
<ol>
<li>Client: It submits map-reduce(MR) jobs to the resource manager.</li>
<li>Resource Manager: It is the master daemon of YARN and is responsible for resource assignment and management among all the applications. Whenever it receives a processing request, it forwards it to the corresponding node manager and allocates resources for the completion of the request accordingly. It has two major components:</li>
<li>Scheduler: It performs scheduling based on the allocated application and available resources. It is a pure scheduler, which means that it does not perform other tasks such as monitoring or tracking and does not guarantee a restart if a task fails. The YARN scheduler supports plugins such as Capacity Scheduler and Fair Scheduler to partition the cluster resources.</li>
<li>Application manager: It is responsible for accepting the application and negotiating the first container from the resource manager. It also restarts the Application Manager container if a task fails.</li>
<li>Node Manager: It takes care of individual nodes on the Hadoop cluster and manages application and workflow and that particular node. Its primary job is to keep up with the Node Manager. It monitors resource usage, performs log management, and also kills a container based on directions from the resource manager. It is also responsible for creating the container process and starting it at the request of the Application master.</li>
<li>Application Master: An application is a single job submitted to a framework. The application manager is responsible for negotiating resources with the resource manager, tracking the status, and monitoring the progress of a single application. The application master requests the container from the node manager by sending a Container Launch Context(CLC) which includes everything an application needs to run. Once the application is started, it sends the health report to the resource manager from time-to-time.</li>
<li>Container: It is a collection of physical resources such as RAM, CPU cores, and disk on a single node. The containers are invoked by Container Launch Context(CLC) which is a record that contains information such as environment variables, security tokens, dependencies, etc. </br></br></li>
</ol>
</li>
</ol>
<h1 id="mapreduce-framework">MapReduce framework</h1>
<p><img alt="MapReduce Framework" src="../images/map_reduce.jpg" /></p>
<ol>
<li>The term MapReduce represents two separate and distinct tasks Hadoop programs perform-Map Job and Reduce Job. Map jobs take data sets as input and process them to produce key-value pairs. Reduce job takes the output of the Map job i.e. the key-value pairs and aggregates them to produce desired results.</li>
<li>Hadoop MapReduce (Hadoop Map/Reduce) is a software framework for distributed processing of large data sets on computing clusters. Mapreduce helps to split the input data set into a number of parts and run a program on all data parts parallel at once.</li>
<li>Please find the below Word count example demonstrating the usage of the MapReduce framework:</li>
</ol>
<p><img alt="Word Count Example" src="../images/mapreduce_example.jpg" />
</br></br></p>
<h1 id="other-tooling-around-hadoop">Other tooling around Hadoop</h1>
<ol>
<li><a href="https://hive.apache.org/"><strong>Hive</strong></a><ol>
<li>Uses a language called HQL which is very SQL like. Gives non-programmers the ability to query and analyze data in Hadoop. Is basically an abstraction layer on top of map-reduce.</li>
<li>Ex. HQL query:<ol>
<li><em>SELECT pet.name, comment FROM pet JOIN event ON (pet.name = event.name);</em></li>
</ol>
</li>
<li>In mysql:<ol>
<li><em>SELECT pet.name, comment FROM pet, event WHERE pet.name = event.name;</em></li>
</ol>
</li>
</ol>
</li>
<li>
<p><a href="https://pig.apache.org/"><strong>Pig</strong></a></p>
<ol>
<li>Uses a scripting language called Pig Latin, which is more workflow driven. Don't need to be an expert Java programmer but need a few coding skills. Is also an abstraction layer on top of map-reduce.</li>
<li>Here is a quick question for you:
What is the output of running the pig queries in the right column against the data present in the left column in the below image?</li>
</ol>
<p><img alt="Pig Example" src="../images/pig_example.png" /></p>
<p>Output:
<code>7,Komal,Nayak,24,9848022334,trivendram
8,Bharathi,Nambiayar,24,9848022333,Chennai
5,Trupthi,Mohanthy,23,9848022336,Bhuwaneshwar
6,Archana,Mishra,23,9848022335,Chennai</code></p>
</li>
<li>
<p><a href="https://spark.apache.org/"><strong>Spark</strong></a></p>
<ol>
<li>Spark provides primitives for in-memory cluster computing that allows user programs to load data into a clusters memory and query it repeatedly, making it well suited to machine learning algorithms.</li>
</ol>
</li>
<li><a href="https://prestodb.io/"><strong>Presto</strong></a><ol>
<li>Presto is a high performance, distributed SQL query engine for Big Data.</li>
<li>Its architecture allows users to query a variety of data sources such as Hadoop, AWS S3, Alluxio, MySQL, Cassandra, Kafka, and MongoDB.</li>
<li>Example presto query:
<code>use studentDB;
show tables;
SELECT roll_no, name FROM studentDB.studentDetails where section=A limit 5;</code>
</br></li>
</ol>
</li>
</ol>
<h1 id="data-serialisation-and-storage">Data Serialisation and storage</h1>
<ol>
<li>In order to transport the data over the network or to store on some persistent storage, we use the process of translating data structures or objects state into binary or textual form. We call this process serialization..</li>
<li>Avro data is stored in a container file (a .avro file) and its schema (the .avsc file) is stored with the data file.</li>
<li>Apache Hive provides support to store a table as Avro and can also query data in this serialisation format.</li>
</ol>
</article>
</div>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-nav">
<nav class="md-footer-nav__inner md-grid" aria-label="Footer">
<a href="../intro/" class="md-footer-nav__link md-footer-nav__link--prev" rel="prev">
<div class="md-footer-nav__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
</div>
<div class="md-footer-nav__title">
<div class="md-ellipsis">
<span class="md-footer-nav__direction">
Previous
</span>
Introduction
</div>
</div>
</a>
<a href="../tasks/" class="md-footer-nav__link md-footer-nav__link--next" rel="next">
<div class="md-footer-nav__title">
<div class="md-ellipsis">
<span class="md-footer-nav__direction">
Next
</span>
Conclusion
</div>
</div>
<div class="md-footer-nav__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
</div>
</a>
</nav>
</div>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-footer-copyright">
<div class="md-footer-copyright__highlight">
Copyright 2020 LinkedIn Corporation. All Rights Reserved. Licensed under the Creative Commons Attribution 4.0 International Public License
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
<div class="md-footer-social">
<a href="https://github.com/linkedin/school-of-sre" target="_blank" rel="noopener" title="github.com" class="md-footer-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<script src="../../assets/javascripts/vendor.18f0862e.min.js"></script>
<script src="../../assets/javascripts/bundle.994580cf.min.js"></script><script id="__lang" type="application/json">{"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing"}</script>
<script>
app = initialize({
base: "../..",
features: [],
search: Object.assign({
worker: "../../assets/javascripts/worker/search.9c0e82ba.min.js"
}, typeof search !== "undefined" && search)
})
</script>
</body>
</html>