If you want to learn Hadoop, Spark, and Python (PySpark), we have published a Docker container to facilitate your learning. The source code is available on GitHub, and the container is published on Docker Hub. An example notebook is also provided to get you jump-started (see below).
# Distribute the integers 0..9999 across the cluster, then sum their squares.
num_rdd = sc.parallelize(list(range(10000)))
squares = num_rdd.map(lambda n: n * n)
squares.reduce(lambda acc, val: acc + val)
import random
import pandas as pd
def get_data(n_rows=100, n_cols=10):
    """Yield ``n_rows`` dicts of random integer features.

    Each yielded dict maps column names ``'x0'..'x{n_cols-1}'`` to a
    uniform random integer in [0, 100] (inclusive, per ``random.randint``).

    Args:
        n_rows: number of rows (dicts) to generate. Default 100.
        n_cols: number of columns per row. Default 10.

    Yields:
        dict[str, int]: one row of random feature values.
    """
    for _ in range(n_rows):
        yield {'x{}'.format(c): random.randint(0, 100) for c in range(n_cols)}
# Materialize the generated rows into a pandas DataFrame and write it
# out as a headered CSV (no index column) for Spark to pick up below.
df = pd.DataFrame(get_data())
df.to_csv('data.csv', index=False)
# Load the CSV from HDFS as a plain RDD of text lines (one string per line,
# header included) — contrast with the schema-aware DataFrame read below.
data_rdd = sc.textFile('hdfs://localhost/data.csv')
# Load the CSV from HDFS as a DataFrame using Spark's built-in csv source
# (available since Spark 2.0). The external 'com.databricks.spark.csv'
# package the original used is only needed on Spark 1.x, and the
# header/inferSchema options take real booleans rather than 'true' strings.
data_df = spark.read.csv('hdfs://localhost/data.csv',
                         header=True, sep=',', inferSchema=True)
# Show the column names and the types inferred from the data.
data_df.printSchema()
from pyspark.sql.functions import lit
from graphframes import GraphFrame
# Vertex table: one row per person, plus a constant "entity" tag column.
people = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
v = sqlContext.createDataFrame(people, ["id", "name", "age"]).withColumn("entity", lit("person"))
# Edge table: directed (src, dst) pairs with a relationship label.
relationships = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
e = sqlContext.createDataFrame(relationships, ["src", "dst", "relationship"])
# Assemble the property graph from the vertex and edge DataFrames.
g = GraphFrame(v, e)
# Bare expressions: in a notebook these display the vertex and edge DataFrames.
g.vertices
g.edges