{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c3bac98d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "23/01/26 09:55:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "from __future__ import print_function\n",
    "import sys\n",
    "from random import random\n",
    "from operator import add\n",
    "from pyspark.sql import SparkSession\n",
    "import os\n",
    "import pyspark.sql.functions as f\n",
    "  \n",
    "spark = SparkSession.builder.appName(\"PyPi\").getOrCreate()\n",
    "df_all = spark.read.option('lineSep', r'(THE\\sEND)').text(\"./data/Shakespeare.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb3abe55",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_all.withColumn('value', f.regexp_replace('value', r'<<[\\w\\s\\d\\n()\\,.-]{495}>>', ''))\\\n",
    "            .withColumn('value', f.explode(f.split('value', r'THE\\sEND', -1))) \\\n",
    "            .withColumn('index', f.monotonically_increasing_id())\\\n",
    "            .filter(\"index > 1\")\\\n",
    "            .filter('index < 38') \\\n",
    "            .withColumn(\"title\", f.regexp_extract('value', r'(.*)\\n*by', 0))\\\n",
    "            .withColumn(\"value\", f.regexp_replace('value', r'([A-Z ,]*)\\n*by William Shakespeare', ''))\\\n",
    "            .withColumn(\"title\", f.regexp_replace('title', r'\\n*by', ''))\\\n",
    "            .withColumn(\"year\", f.regexp_extract('value', r'\\d{4}', 0))\\\n",
    "            .withColumn(\"value\", f.regexp_replace('value', r'\\d{4}', ''))\\\n",
    "            .withColumn('value', f.trim('value'))\\\n",
    "            .withColumn('value', f.regexp_replace('value', r' {2,}', ' '))\\\n",
    "            .withColumn('value', f.regexp_replace('value', r'\\n{2,}', ''))\\\n",
    "            .withColumn('wordCount', f.size(f.split('value', ' ')))\\\n",
    "            .withColumn('lineCount', f.size(f.split('value', r'\\n')))\\\n",
    "            .orderBy(f.col(\"lineCount\").desc())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3991296f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "THE TRAGEDY OF HAMLET, PRINCE OF DENMARK, 3947 lines, 32079 words.\n",
      "KING RICHARD III, 3914 lines, 31193 words.\n",
      "THE TRAGEDY OF CORIOLANUS, 3691 lines, 29293 words.\n",
      "CYMBELINE, 3649 lines, 28870 words.\n",
      "THE TRAGEDY OF ANTONY AND CLEOPATRA, 3587 lines, 26552 words.\n",
      "THE TRAGEDY OF OTHELLO, MOOR OF VENICE, 3479 lines, 27986 words.\n",
      "THE TRAGEDY OF KING LEAR, 3433 lines, 27585 words.\n",
      "THE HISTORY OF TROILUS AND CRESSIDA, 3431 lines, 27623 words.\n",
      "KING HENRY THE EIGHTH, 3327 lines, 25886 words.\n",
      "THE WINTER'S TALE, 3249 lines, 26059 words.\n",
      "THE LIFE OF KING HENRY THE FIFTH, 3147 lines, 27498 words.\n",
      "THE SECOND PART OF KING HENRY THE SIXTH, 3133 lines, 26840 words.\n",
      "SECOND PART OF KING HENRY IV, 3101 lines, 27689 words.\n",
      "THE TRAGEDY OF ROMEO AND JULIET, 3089 lines, 25857 words.\n",
      "THE THIRD PART OF KING HENRY THE SIXTH, 3012 lines, 25873 words.\n",
      "THE FIRST PART OF KING HENRY THE FOURTH, 2926 lines, 25783 words.\n",
      "THE FIRST PART OF HENRY THE SIXTH, 2852 lines, 22883 words.\n",
      "KING RICHARD THE SECOND, 2851 lines, 23363 words.\n",
      "MEASURE FOR MEASURE, 2740 lines, 22947 words.\n",
      "LOVE'S LABOUR'S LOST, 2735 lines, 22987 words.\n",
      "KING JOHN, 2659 lines, 21776 words.\n",
      "THE TRAGEDY OF JULIUS CAESAR, 2629 lines, 20930 words.\n",
      "THE TRAGEDY OF TITUS ANDRONICUS, 2625 lines, 21701 words.\n",
      "THE TAMING OF THE SHREW, 2616 lines, 22243 words.\n",
      "THE MERCHANT OF VENICE, 2609 lines, 22309 words.\n",
      "THE MERRY WIVES OF WINDSOR, 2579 lines, 23411 words.\n",
      "AS YOU LIKE IT, 2543 lines, 22860 words.\n",
      "THE LIFE OF TIMON OF ATHENS, 2437 lines, 19691 words.\n",
      "MUCH ADO ABOUT NOTHING, 2425 lines, 22501 words.\n",
      "THE TRAGEDY OF MACBETH, 2396 lines, 18246 words.\n",
      "TWELFTH NIGHT; OR, WHAT YOU WILL, 2353 lines, 21208 words.\n",
      "THE TEMPEST, 2328 lines, 17498 words.\n",
      "THE TWO GENTLEMEN OF VERONA, 2196 lines, 18327 words.\n",
      "A MIDSUMMER NIGHT'S DREAM, 2119 lines, 17306 words.\n",
      "THE COMEDY OF ERRORS, 1815 lines, 15464 words.\n",
      "A LOVER'S COMPLAINT, 283 lines, 2579 words.\n"
     ]
    }
   ],
   "source": [
    "def play_counts(df):\n",
    "    results = df.select('title','lineCount', 'wordCount').collect()\n",
    "    for r in results:\n",
    "        print(f\"{r['title']}, {r['lineCount']} lines, {r['wordCount']} words.\")\n",
    "\n",
    "# df.filter(\"index == 37\").collect()\n",
    "play_counts(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "676fab8d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}