# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
# All columns that are being cleaned
text_columns = [
"TITLE_RAW", "BODY", "SKILLS_NAME",
"SPECIALIZED_SKILLS_NAME",
"CERTIFICATIONS_NAME",
"COMMON_SKILLS_NAME",
"SOFTWARE_SKILLS_NAME", "URL",
"EDUCATION_LEVELS_NAME", "LIGHTCAST_SECTORS_NAME",
"CIP6_NAME", "CIP4_NAME", "CIP2_NAME",
"TITLE_NAME", "TITLE_CLEAN",
"COMPANY_NAME", "COMPANY_RAW",
"ONET", "ONET_NAME", "ONET_2019", "ONET_2019_NAME",
"SOC_2021_2_NAME", "SOC_2021_3_NAME", "SOC_2021_4_NAME", "SOC_2021_5_NAME",
"LOT_CAREER_AREA_NAME", "LOT_OCCUPATION_NAME", "LOT_SPECIALIZED_OCCUPATION_NAME",
"LOT_OCCUPATION_GROUP_NAME", "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
"LOT_V6_OCCUPATION_NAME", "LOT_V6_OCCUPATION_GROUP_NAME", "LOT_V6_CAREER_AREA_NAME",
"NAICS_2022_6_NAME"
]
# Clean text columns
def clean_text_columns(df, columns):
    """
    Cleans text-heavy columns by:
    - Removing brackets, braces, quotes, and newlines
    - Collapsing repeated whitespace into a single space
    - Standardizing commas to a single trailing space
    """
    pattern_cleanup = r'[\[\]\n{}"]'  # remove brackets, braces, newlines, quotes
    for c in columns:
        df = df.withColumn(c, regexp_replace(col(c), pattern_cleanup, ""))
        df = df.withColumn(c, regexp_replace(col(c), r'\s{2,}', ' '))    # multiple spaces → one space
        df = df.withColumn(c, regexp_replace(col(c), r'\s*,\s*', ', '))  # standardize comma spacing
    return df
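
# Optional alternative (a sketch, not used below): each withColumn call adds a
# projection to the logical plan, so for a wide column list the same cleaning
# can be expressed as a single select. The function name and structure here are
# illustrative only.
def clean_text_columns_select(df, columns):
    cleaned = {
        c: regexp_replace(
            regexp_replace(
                regexp_replace(col(c), r'[\[\]\n{}"]', ""),
                r'\s{2,}', ' '),
            r'\s*,\s*', ', ').alias(c)
        for c in columns
    }
    # Keep untouched columns as-is, substitute cleaned expressions where available
    return df.select([cleaned.get(c, col(c)) for c in df.columns])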
# Apply cleaning
df = clean_text_columns(df, text_columns)
# Preview cleaned text (for testing only; no need to render this for the project)
# df.select(text_columns).show(5, truncate=100)
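
# Quick sanity check on a synthetic one-row DataFrame (testing only; the sample
# values and the "clean_text_check" app name are illustrative, and getOrCreate()
# reuses the active session if one already exists). Uncomment to run:
# spark = SparkSession.builder.appName("clean_text_check").getOrCreate()
# sample = spark.createDataFrame(
#     [('["Data Engineer"]\n', '{"Python", "SQL",   "Spark"}')],
#     ["TITLE_RAW", "SKILLS_NAME"],
# )
# clean_text_columns(sample, ["TITLE_RAW", "SKILLS_NAME"]).show(truncate=False)
# # Expected: 'Data Engineer' | 'Python, SQL, Spark'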