There are several ways to create RDDs in PySpark:
- Parallelizing a Collection
- From an External Dataset
- From existing RDDs
1. Parallelizing a Collection
You can create an RDD from an existing collection (like a list) using the parallelize method.
In [1]:
from pyspark import SparkContext
# Initialize SparkContext
sc = SparkContext("local", "RDD Example")
In [2]:
# Create an RDD from a Python list
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
print(type(rdd))
# Collect the RDD to see its contents
print(rdd.collect())
<class 'pyspark.rdd.RDD'>
[1, 2, 3, 4, 5]
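parallelize also accepts an optional second argument that controls how many partitions the data is split into. A minimal sketch, reusing the SparkContext sc created above:

# Split the list into 3 partitions (the second argument to parallelize)
rdd = sc.parallelize([1, 2, 3, 4, 5], 3)
# getNumPartitions reports how many partitions the RDD has
print(rdd.getNumPartitions())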
2. From an External Dataset
You can create an RDD from an external data source such as HDFS, S3, or the local file system using the textFile method.
In [3]:
# Create an RDD from a text file
rdd = sc.textFile("resources/people.txt")
print(type(rdd))
<class 'pyspark.rdd.RDD'>
In [4]:
# Collect the RDD
print(rdd.collect())
['Mohsen 49', 'Jan 35', 'Jos 30', 'Arash 25', 'Eva 28', 'Frank 40', 'Helma 33', 'Dean 31', 'Alican 22', 'Zhanna 27']
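Each element of a textFile RDD is a single line of the file, so any parsing is up to you. A minimal sketch, assuming the whitespace-separated "Name Age" format shown in the output above:

# Split each line into a (name, age) pair
pairs = rdd.map(lambda line: (line.split()[0], int(line.split()[1])))
print(pairs.collect())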
3. From Existing RDDs
New RDDs can be created by applying transformations such as map, filter, and flatMap to existing RDDs.
In [5]:
# Create an RDD from a list
data = ["apple", "banana", "cherry", "date"]
rdd = sc.parallelize(data)
# Transform the RDD using map to create a new RDD
upper_rdd = rdd.map(lambda x: x.upper())
# Collect the transformed RDD
print(upper_rdd.collect())
['APPLE', 'BANANA', 'CHERRY', 'DATE']
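map is only one of the available transformations. A minimal sketch of filter and flatMap applied to the same rdd defined above:

# filter keeps only the elements for which the predicate is True
long_names = rdd.filter(lambda x: len(x) > 5)
print(long_names.collect())  # ['banana', 'cherry']

# flatMap maps each element to zero or more elements and flattens the result
chars = rdd.flatMap(lambda x: list(x))
print(chars.collect())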