%%HTML
<script src="require.js"></script>

from IPython.display import display, HTML, clear_output
# Build an HTML snippet that loads jQuery and adds a button toggling the
# visibility of every element matching
# 'div.jp-CodeCell > div.jp-Cell-inputWrapper'. `code_toggle` is also
# registered on document-ready, so the matched inputs are hidden on first
# load (code_show starts true, which takes the hide branch, then flips).
HTML('''
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js "></script><script>
code_show=true; 
function code_toggle() {
if (code_show){
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
} else {
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
}
code_show = !code_show
} 
$( document ).ready(code_toggle);</script><form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')

import ast
import json
import random
import re
import sqlite3
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

def truncated_svd(X):
    """
    Decompose the design matrix with `np.linalg.svd` and report how much of
    the total squared singular value mass each singular value carries.

    Despite the name, no truncation happens here: the decomposition is
    returned in full.

    Parameters
    ----------
    X : array-like
        Design matrix

    Returns
    ----------
    q : array-like
        First output of `np.linalg.svd` (left singular vectors)

    s_diag : array-like
        Square matrix with the singular values on its diagonal

    p : array-like
        Transpose of the third output of `np.linalg.svd`, so each column is
        one right singular vector

    nssd : array-like
        Squared singular values divided by their sum, one entry per
        singular value
    """
    left_vectors, singular_values, right_vectors_t = np.linalg.svd(X)
    squared = singular_values ** 2
    share = squared / np.sum(squared)
    return left_vectors, np.diag(singular_values), right_vectors_t.T, share

def project_svd(q, s, k):
    """
    Project onto the first k singular vectors by multiplying the leading
    k columns of `q` with the leading k-by-k corner of `s`.

    Parameters
    ----------
    q : array-like
        Matrix whose columns are the left singular vectors

    s : array-like
        Diagonal matrix of singular values

    k : int
        Number of leading singular vectors to keep

    Returns
    ----------
    Q_projected : array-like
        Product of `q[:, :k]` and `s[:k, :k]`
    """
    return np.dot(q[:, :k], s[:k, :k])

def plot_svd(X_new, features, p):
    """
    Draw two side-by-side panels: a scatter of the transformed data and an
    arrow per feature, both against the first two singular vectors.

    Parameters
    ----------
    X_new : array
        Transformed data; columns 0 and 1 are plotted
    features : sequence of str
        Feature names, paired row by row with `p`
    p : array
        P matrix; entries 0 and 1 of each row give the arrow tip
    """
    figure, (ax_points, ax_features) = plt.subplots(
        1, 2,
        subplot_kw={'aspect': 'equal'},
        gridspec_kw={'wspace': 0.4},
        dpi=150,
    )

    # Left panel: observations in the SV1/SV2 plane.
    ax_points.scatter(X_new[:, 0], X_new[:, 1])

    # Right panel: one labelled arrow from the origin per feature.
    for name, loading in zip(features, p):
        tip_x, tip_y = loading[0], loading[1]
        ax_features.arrow(0, 0, tip_x, tip_y, width=0.001, ec='none',
                          fc='skyblue')
        ax_features.text(tip_x, tip_y, name, ha='center', color='black',
                         fontsize=5)

    for panel in (ax_points, ax_features):
        panel.set_xlabel('SV1')
        panel.set_ylabel('SV2')

def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    """
    Generate a color in varying shades of gray for a given word.

    The lightness is drawn uniformly from the integers 60..99. When a
    `random_state` is supplied the draw comes from it, so a seeded caller
    gets reproducible shades; without one the global NumPy generator is
    used, as before.

    Parameters
    ----------
    word : str
        Word (unused)

    font_size : int
        Font size (unused)

    position
        Position (unused)

    orientation
        Orientation (unused)

    random_state : None, int, or object with a `randrange` method
        Source of randomness. An integer is treated as a seed for
        `random.Random`; any other non-None value must provide
        `randrange(start, stop)` (e.g. a `random.Random` instance)

    **kwargs
        Additional arguments (ignored)

    Returns:
    ----------
    str
        An HSL color string representing a shade of gray.
    """
    if random_state is None:
        lightness = np.random.randint(60, 100)
    else:
        if isinstance(random_state, (int, np.integer)):
            random_state = random.Random(int(random_state))
        # randrange excludes the upper bound, matching np.random.randint.
        lightness = random_state.randrange(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness

def parse_classification(classification_str):
    """
    Parse classification of jobs from dict-like data.

    The string is first evaluated as a Python literal, which copes with
    values that contain ``": "`` or apostrophes (the latter are rendered
    with double quotes in a dict repr). If it is not a valid literal, the
    original split-based parsing is used as a fallback.

    Parameters
    ----------
    classification_str : str
        Dict-like string whose first two entries are the ID and the
        description, e.g. ``"{'id': '6281', 'description': '...'}"``

    Returns
    ----------
    id_part : str
        ID part of string

    description_part : str
        Description part of string

    Raises
    ----------
    IndexError
        If the fallback parser cannot find two ``key: value`` entries
    """
    try:
        parsed = ast.literal_eval(classification_str)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, dict) and len(parsed) >= 2:
        # Entries are taken by position, like the split-based parser did.
        id_value, description_value = list(parsed.values())[:2]
        return str(id_value), str(description_value)

    # Fallback: legacy parsing for strings that are not valid literals.
    parts = classification_str.strip("{}").split("', '")

    id_part = parts[0].split(": ")[1].strip("'")
    description_part = parts[1].split(": ")[1].strip("'")

    return id_part, description_part

def parse_location(location_str):
    """
    Parse locations from dict-like data.

    The string is first evaluated as a Python literal, so labels with
    apostrophes and values such as ``None`` are handled. If that fails,
    the original approach (replace single quotes with double quotes and
    parse as JSON) is used as a fallback.

    Parameters
    ----------
    location_str : str
        Dict-like string

    Returns
    ----------
    label : str
        Value under "label", or "" when the key is absent

    countryCode : str
        Value under "countryCode", or "" when the key is absent

    seoHierarchy : str
        JSON dump of the value under "seoHierarchy" ("[]" when absent)

    Raises
    ----------
    json.JSONDecodeError
        If the string is neither a Python dict literal nor parseable by the
        JSON fallback
    """
    try:
        location_dict = ast.literal_eval(location_str)
    except (ValueError, SyntaxError, TypeError):
        location_dict = None

    if not isinstance(location_dict, dict):
        # Fallback: legacy quote-swapping JSON parse.
        location_dict = json.loads(location_str.replace("'", '"'))

    label = location_dict.get("label", "")
    countryCode = location_dict.get("countryCode", "")
    seoHierarchy = json.dumps(location_dict.get("seoHierarchy", []))

    return label, countryCode, seoHierarchy

def remove_empty_strs(df):
    """
    Drop, in place, every row whose 'skill' value is the empty string.

    Parameters
    ----------
    df
        Dataframe with a 'skill' column; it is modified in place

    Returns
    ----------
    df
        The same dataframe object that was passed in
    """
    blank_skill_labels = df.index[df['skill'] == '']
    df.drop(index=blank_skill_labels, inplace=True)
    return df

def word_cloud_svd(num):
    """
    Draw a word cloud of the 15 features with the largest absolute weight
    in column `num` of the global matrix `p`.

    Reads the globals `p`, `feature_names` and `color`, and rebinds the
    globals `lst_positivity` / `lst_negativity` to the features whose
    weight is positive / not positive. Those two lists are what `color`
    consults while the cloud is being generated.

    Parameters
    ----------
    num
        Zero-based column of `p` to visualise; the plot title shows
        `num + 1`
    """
    global lst_negativity, lst_positivity

    column = p[:, num]
    top_positions = np.argsort(np.abs(column))[-15:]
    signed_weights = column[top_positions]
    magnitudes = np.abs(signed_weights)

    # One entry per selected feature; each count is scaled by
    # 100 * |weight| and truncated to an integer frequency.
    frequencies = Counter(feature_names[top_positions])

    lst_negativity = []
    lst_positivity = []
    for token, magnitude, signed in zip(list(frequencies), magnitudes,
                                        signed_weights):
        frequencies[token] = int(frequencies[token] * (100 * magnitude))
        bucket = lst_positivity if signed > 0 else lst_negativity
        bucket.append(token)

    cloud = WordCloud(color_func=color, width=2000, height=1200, margin=3,
                      scale=1, prefer_horizontal=0.7,
                      background_color='white',
                      relative_scaling=0.000001)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud)
    plt.title(f"SV {num + 1}")
    plt.axis("off")
    plt.show()

def nssd_cum(nssd, threshold=0.8, n_sv=70):
    """
    Plot the per-singular-value share (`nssd`) and its cumulative sum on a
    shared x axis, with optional dashed guide lines.

    Parameters
    ----------
    nssd : array-like
        Share per singular value, as returned by `truncated_svd`

    threshold : float or None, default 0.8
        Height of the horizontal guide line on the cumulative axis; pass
        None to omit it

    n_sv : int or None, default 70
        x position of the vertical guide line; pass None to omit it
    """
    sv_numbers = range(1, len(nssd) + 1)

    fig, ax = plt.subplots()
    ax.plot(sv_numbers, nssd, '-', label='individual')
    ax.set_xlim(0, len(nssd) + 1)
    ax.set_xlabel('SV')
    ax.set_ylabel('variance explained')

    # Second y axis so the cumulative curve gets its own scale.
    ax = ax.twinx()
    ax.plot(sv_numbers, np.cumsum(nssd), 'r-', label='cumulative')
    if threshold is not None:
        ax.axhline(threshold, ls='--', color='g')
    if n_sv is not None:
        ax.axvline(n_sv, ls='--', color='g')
    ax.set_ylabel('cumulative variance explained')
    ax.set_title('Variance Explained vs. Cum. Variance Explained')
    plt.show()

def color(word, font_size, position, orientation, font_path, random_state):
    """
    Pick the word-cloud color for a word from its sign list and the table
    whose columns contain it.

    Reads the globals `lst_positivity`, `lst_negativity`,
    `df_classification` and `skills_df`. Only `word` is used; the other
    parameters are accepted and ignored.

    Parameters
    ----------
    word : str
        Word

    font_size
        Font size (unused)

    position
        Position (unused)

    orientation
        Orientation (unused)

    font_path
        Font path (unused)

    random_state
        Random state (unused)

    Returns
    ----------
    str
        Hex color code: '#C06832' (positive, classification column),
        '#FF8A47' (positive, skills column), '#412395' (negative,
        classification column), otherwise '#B298FC'
    """
    if word in lst_positivity:
        if word in df_classification.columns:
            return '#C06832'
        if word in skills_df.columns:
            return '#FF8A47'

    if word in lst_negativity and word in df_classification.columns:
        return '#412395'

    return '#B298FC'

warnings.filterwarnings("ignore", category=FutureWarning)

# Ten most frequent classifications, sorted ascending so the largest
# counts come last in the series.
classification = df_explore['classification'].value_counts(
    ).head(10).sort_values()

# Color the last (largest) three bars black and the rest gray. The list is
# sized from the data so the highlight stays on the largest bars even when
# fewer than ten classifications are present.
n_bars = len(classification)
n_highlight = min(3, n_bars)
bar_colors = ['gray'] * (n_bars - n_highlight) + ['black'] * n_highlight

plt.figure(figsize=(10, 6))
plt.barh(classification.index,
         classification.values,
         color=bar_colors)

plt.xticks(rotation=0, ha='right')
plt.xlabel('Number of Classifications')
plt.ylabel('Classification')
plt.title('Distribution of Job Classification')

# Display the plot
plt.tight_layout()
plt.show()

# Concatenate every non-missing 'skill' text into one corpus string.
combined_text = " ".join(df['skill'].dropna())

# Build the cloud from single words only, then repaint it in gray shades.
wordcloud = WordCloud(background_color="white", collocations=False)
wordcloud.generate(combined_text)
wordcloud.recolor(color_func=grey_color_func)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Query parameters sent to the search endpoint; 'page' is overwritten on
# every iteration of the loop below.
# NOTE(review): `requests`, `time` and `js_headers` are not defined in the
# visible part of this file - confirm they are set up in an earlier cell.
# NOTE(review): 'eventCaptureSessionId' ends with a stray '&'; confirm
# whether that is intentional.
js_params = {
    'siteKey':'PH-Main',
    'sourcesystem':'houston',
    'userqueryid':'29f535e70ed6837e41115b411d0e4f34-0245335',
    'userid':'1430f579-f0af-4b9a-b969-77535c91cc72',
    'usersessionid':'1430f579-f0af-4b9a-b969-77535c91cc72',
    'eventCaptureSessionId':'1430f579-f0af-4b9a-b969-77535c91cc72&',
    'where':'Philippines',
    'page':'1',
    'seekSelectAllPages':'true',
    'keywords':'data',
    'pageSize':'100',
    'include':'seodata',
    'locale':'en-PH',
    'solId':'cf9086ae-797a-4772-8f88-8dd47c63fad0'
}

js_joblist = []

# NOTE(review): the loop starts at page 0 although the template above uses
# page '1' - verify that page 0 does not duplicate the first page.
for pagenum in range(0, 105):
    js_params['page'] = str(pagenum)

    response = requests.get(
        'https://www.jobstreet.com.ph/api/chalice-search/v4/search',
        headers=js_headers,
        params=js_params,
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    js_joblist.extend(response.json()['data'])

    # Pause between requests to avoid hammering the server.
    time.sleep(1)

# Open the SQLite database file that holds the job tables.
conn = sqlite3.connect('jobs_final.db')

# Load the whole 'jobs_raw' table and preview its first three rows.
df_raw = pd.read_sql('SELECT * FROM jobs_raw', conn)
df_raw.head(3)

# Characters stripped from the scraped description: U+00A0..U+00FF (which
# already includes the middle dot), U+2000..U+200F and the bullet U+2022.
_unwanted_chars = re.compile(r'[\xa0-\xff\u2000-\u200F\u2022]')

job_data = []

# NOTE(review): `requests`, `BeautifulSoup` and `todo_ids_list` are not
# defined in the visible part of this file - confirm they come from an
# earlier cell.
for job_id in todo_ids_list:
    try:
        # Interpolate the job id into the URL (the previous version sent the
        # literal, URL-encoded text "{job_id}" for every job).
        response = requests.get(
            f'https://www.jobstreet.com.ph/job/{job_id}', timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # `class` is a reserved word, so Beautiful Soup takes `class_`.
        job_title = soup.find(
            'h1',
            class_='_1wkzzau0 a1msqi4y lnocuo0 lnocuol _1d0g9qk4 lnocuop lnocuo21'
        ).text
        job_details = soup.find('div', class_='_1wkzzau0 _1pehz540').text

        job_details = _unwanted_chars.sub('', job_details)

        job_data.append({'id': job_id, 'job_title': job_title,
                         'job_details': job_details})
    except Exception as e:
        # Best effort: report the failing job and continue with the rest.
        print(f"Error processing job ID {job_id}: {e}")

# Scraped job descriptions, one row per job.
df_details = pd.read_sql('SELECT * FROM jobs_details', conn)
df_details.head(3)

# Columns of the raw search results that the analysis does not use.
unused_columns = [
    'automaticInclusion',
    'companyProfileStructuredDataId', 'displayStyle', 'displayType',
    'listingDateDisplay', 'isPrivateAdvertiser',
    'advertiser', 'branding', 'bulletPoints', 'locationId',
    'isPremium', 'logo',
    'isStandOut', 'listingDate', 'tags', 'searchInsights',
    'tracking', 'areaId', 'solMetadata',
    'area', 'areaWhereValue', 'suburbId', 'locationWhereValue',
    'suburbWhereValue',
]

# `drop` returns a new frame, so `df_raw` itself is left untouched.
df_consolidated = df_raw.drop(columns=unused_columns)

df_consolidated.head(3)

# Each dict-like column is parsed row by row; only the human-readable part
# is kept as a new column and the original dict-like column is dropped.

# classification -> classification_description
class_ids, class_descriptions = zip(
    *df_consolidated["classification"].apply(parse_classification))
df_consolidated["classification_description"] = class_descriptions
df_consolidated = df_consolidated.drop(columns=['classification'])

# subClassification -> subClassification_description
subclass_ids, subclass_descriptions = zip(
    *df_consolidated['subClassification'].apply(parse_classification))
df_consolidated['subClassification_description'] = subclass_descriptions
df_consolidated = df_consolidated.drop(columns=['subClassification'])

# jobLocation -> jobLocation_label (country code and SEO hierarchy are
# parsed but not kept)
location_labels, location_country_codes, location_seo_hierarchies = zip(
    *df_consolidated["jobLocation"].apply(parse_location))
df_consolidated["jobLocation_label"] = location_labels
df_consolidated = df_consolidated.drop(columns=['jobLocation'])

df_consolidated.head(3)

# Join search results with scraped details (id == jobId), keep one row per
# id, and discard the search-result 'title' column.
df_combined = (
    df_consolidated
    .merge(df_details, left_on="id", right_on="jobId", how="inner")
    .drop_duplicates(subset="id")
    .drop(columns=['title'])
)
df_combined.head(3)

# Give the parsed columns their final names.
df_combined = df_combined.rename(columns={
    'classification_description': 'classification',
    'subClassification_description': 'subClassification',
    'jobLocation_label': 'jobLocation',
})

# Final column selection and order.
cols = ['id', 'jobTitle', 'jobDetails', 'companyName', 'location', 'roleId',
        'salary', 'teaser', 'workType', 'currencyLabel', 'suburb',
        'classification', 'subClassification', 'jobLocation']
df_combined = df_combined[cols]

df_combined.head(3)

# Write the combined frame to the 'jobs' table, replacing any existing one.
# NOTE(review): `conn` is rebound to a new connection here and again below
# without closing the previous one - consider reusing a single connection.
conn = sqlite3.connect('jobs_final.db')
df_combined.to_sql('jobs', conn, if_exists='replace', index=False)

# Bare string expression; it is not executed as a query.
'SELECT * FROM jobs'

# Reload the stored table as the working frame `df`.
conn = sqlite3.connect('jobs_final.db')
# NOTE(review): `cursor` is not used in the visible part of this file.
cursor = conn.cursor()
df = pd.read_sql_query("SELECT * FROM jobs", conn)

df.head(3)

# (regex, replacement) pairs applied in order to every job description:
# insert a space where a lowercase letter, '.' or ':' is directly followed
# by an uppercase letter, turn U+202F into a plain space, and remove the
# middle dot.
patterns = [
    (r'([a-z])([A-Z])', r'\1 \2'),
    (r'([.])([A-Z])', r'\1 \2'),
    (r'([:])([A-Z])', r'\1 \2'),
    (r'\u202f', ' '),
    (r'·', ''),
]

for pattern, replacement in patterns:
    df['jobDetails'] = df['jobDetails'].map(
        lambda text, rx=pattern, repl=replacement: re.sub(rx, repl, text))

df.iloc[0]['jobDetails']

'JOB SUMMARY: Intraday specialist is a critical role holding responsibilities to support and manage the Real Time monitoring activities of contact center operations. Intraday specialist must ensure the execution of workforce plans with focus on adherence and achievement of Service Level objectives through real time interventions. KEY RESPONSIBILITIES AND DUTIES: Good understanding of Real Time Management and its deliverables Knowledge of WFM tool like, Verint, Genesys, Aspect, IEX. Creating a game plan for the day and ensuring right people in the right place at the right time Working knowledge of the operation SLAs, GOS, Productivity, Utilization & other KPIs Performing Real Time SL Escalation Procedures Ensuring Service Levels are met daily and if they are not met, complete root cause analysis Ensure timely delivery of reports with proper insights Perform Skill Audits Managing Intraday Plan and communication of intraday performance Tracking documentation and communication of System Issues Communication, tracking and entering overtime (OT) and VTO availability on an intraday basis Perform other duties and assignments as directed QUALIFICATIONS: 3 to 4 years of relevant experience Proficient in MS Word, MS Excel, MS Power Point, VBA, Advance Excel, etc. Proficient in at least one WFM tool (Genesys, NICE, Aspect, Calabrio, Teleopti, Verint etc.) Hands on experience in working with Telephony databases Strong mathematical, analytical, communication, and organization skills Self-motivated and must excel in a highly dynamic work environment Willing to work onsite'

# Keep a row when its title, details or teaser mentions "data" or
# "analyst" (case-insensitive).
keyword_regex = "|".join(["data", "analyst"])
title_hit = df["jobTitle"].str.contains(keyword_regex, case=False)
details_hit = df["jobDetails"].str.contains(keyword_regex, case=False)
teaser_hit = df["teaser"].str.contains(keyword_regex, case=False)

df = df[title_hit | details_hit | teaser_hit]

# Collect the index labels of rows whose 'jobDetails' text contains 'skill'.
# The column is named 'jobDetails' in this frame; looking up 'job_details'
# raised a KeyError. Missing descriptions count as "no match".
index_with_skill = df.index[
    df['jobDetails'].str.contains('skill', na=False)].tolist()

# If there is at least one row with 'skill', show the first one
if index_with_skill:
    first_row_with_skill = df.loc[index_with_skill[0]]
    print(first_row_with_skill)
else:
    print("No row contains 'skill' in the 'jobDetails' column.")

%%capture
# Index labels of every row whose 'jobDetails' text contains 'skill'.
index_with_skill = df.index[df['jobDetails'].str.contains('skill')].tolist()

# Report the first matching row, or say that there is none.
if not index_with_skill:
    print("No row contains 'skill' in the 'job_details' column.")
else:
    first_row_with_skill = df.loc[index_with_skill[0]]
    print(first_row_with_skill)

# Spot checks: the description of one specific job id and of the first row.
df.loc[df['id'] == 70301897, 'jobDetails']
df['jobDetails'].iloc[0]

'JOB SUMMARY: Intraday specialist is a critical role holding responsibilities to support and manage the Real Time monitoring activities of contact center operations. Intraday specialist must ensure the execution of workforce plans with focus on adherence and achievement of Service Level objectives through real time interventions. KEY RESPONSIBILITIES AND DUTIES: Good understanding of Real Time Management and its deliverables Knowledge of WFM tool like, Verint, Genesys, Aspect, IEX. Creating a game plan for the day and ensuring right people in the right place at the right time Working knowledge of the operation SLAs, GOS, Productivity, Utilization & other KPIs Performing Real Time SL Escalation Procedures Ensuring Service Levels are met daily and if they are not met, complete root cause analysis Ensure timely delivery of reports with proper insights Perform Skill Audits Managing Intraday Plan and communication of intraday performance Tracking documentation and communication of System Issues Communication, tracking and entering overtime (OT) and VTO availability on an intraday basis Perform other duties and assignments as directed QUALIFICATIONS: 3 to 4 years of relevant experience Proficient in MS Word, MS Excel, MS Power Point, VBA, Advance Excel, etc. Proficient in at least one WFM tool (Genesys, NICE, Aspect, Calabrio, Teleopti, Verint etc.) Hands on experience in working with Telephony databases Strong mathematical, analytical, communication, and organization skills Self-motivated and must excel in a highly dynamic work environment Willing to work onsite'

# Match the first "skill" (any case), skip ahead to the next whitespace
# character, and capture what follows up to the end of that line.
skill_pattern = r'(?:skill).*?\s(.*)'

warnings.filterwarnings('ignore')


def _text_after_skill(details):
    """Return the text captured by `skill_pattern`, or "" without a match."""
    found = re.search(skill_pattern, details, flags=re.IGNORECASE)
    return found.group(1) if found else ""


# Store the extracted text in a new 'skill' column.
df["skill"] = df["jobDetails"].apply(_text_after_skill)

# Show the extracted text of the first row
print(df.iloc[0]["skill"])

Audits Managing Intraday Plan and communication of intraday performance Tracking documentation and communication of System Issues Communication, tracking and entering overtime (OT) and VTO availability on an intraday basis Perform other duties and assignments as directed QUALIFICATIONS: 3 to 4 years of relevant experience Proficient in MS Word, MS Excel, MS Power Point, VBA, Advance Excel, etc. Proficient in at least one WFM tool (Genesys, NICE, Aspect, Calabrio, Teleopti, Verint etc.) Hands on experience in working with Telephony databases Strong mathematical, analytical, communication, and organization skills Self-motivated and must excel in a highly dynamic work environment Willing to work onsite

# Keep only the two columns the analysis needs; `df_explore` is a separate
# selection of the same two columns.
selected_columns = ['classification', 'skill']
df = df[selected_columns]
df_explore = df[selected_columns]
df.head(3)

# One-hot encode the classification (integer 0/1 columns) and swap the
# original text column for the indicator columns.
df_classification = pd.get_dummies(df['classification'], dtype=int)
df = pd.concat((df, df_classification), axis=1).drop(
    columns='classification')
df.head(3)

# Drop rows for which no skill text was extracted, then count missing
# values per column.
df = remove_empty_strs(df)

df.isna().sum()

skill                                     0
Accounting                                0
Administration & Office Support           0
Advertising, Arts & Media                 0
Banking & Financial Services              0
CEO & General Management                  0
Call Centre & Customer Service            0
Community Services & Development          0
Construction                              0
Consulting & Strategy                     0
Design & Architecture                     0
Education & Training                      0
Engineering                               0
Farming, Animals & Conservation           0
Government & Defence                      0
Healthcare & Medical                      0
Hospitality & Tourism                     0
Human Resources & Recruitment             0
Information & Communication Technology    0
Insurance & Superannuation                0
Legal                                     0
Manufacturing, Transport & Logistics      0
Marketing & Communications                0
Mining, Resources & Energy                0
Real Estate & Property                    0
Retail & Consumer Products                0
Sales                                     0
Science & Technology                      0
Trades & Services                         0
dtype: int64

# English Stop Words: one word per line in stopwords.txt
with open('stopwords.txt') as f:
    en_words = [line.replace('\n', '') for line in f]

df_stop = pd.read_sql('SELECT * FROM jobs', conn)

# Company Stop Words: every lower-cased word that appears in a company name
company_names = df_stop['companyName'].dropna().tolist()
all_phrases = ' '.join(company_names)
words_without_punctuation = re.findall(r'\b\w+\b', all_phrases)
comp_stopwords = list(map(str.lower, words_without_punctuation))

# SKLearn English Stop Words
sklearn_en = list(ENGLISH_STOP_WORDS)

# Business Words
business_words = [
    'citi', 'workday', 'bonifacio', 'senior', 'specialist',
    'officer', 'associate', 'assistant', 'hmo', 'work', 'plaza',
    'officers', 'opening', 'openings', 'us', 'll', 'join',
    'joins', 'pasig', 'azure', 'japanese', 'role', 'apply',
    'day', 'benefits', 'job', 'time', 'description',
    'experience', 'taguig', 'workday', 'complete', 'application',
    'reminder', 'skills', 'required',
]

# Warning Words
warn_words = ['ek', 'five', 'fr', 'inkjets', 'millennium', 'r', 'sec', 'x']

# Single list handed to the vectorizer
combined_stopwords = [
    *en_words,
    *comp_stopwords,
    *sklearn_en,
    *business_words,
    *warn_words,
]

# TF-IDF over unigrams and bigrams of lowercase-letter tokens, excluding
# the combined stop words; min_df/max_df restrict terms to those found in
# 5%-95% of the skill texts.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'[a-z]+',
    stop_words=combined_stopwords,
    min_df=.05,
    max_df=.95,
    ngram_range=(1, 2),
)
details_bow = tfidf_vectorizer.fit_transform(df['skill'])

# Sparse-backed frame with one column per term.
skills_df = pd.DataFrame.sparse.from_spmatrix(
    details_bow, columns=tfidf_vectorizer.get_feature_names_out())
skills_df.head(3)

# Align both frames on a fresh 0..n-1 index, then place the classification
# indicator columns and the TF-IDF columns side by side.
skills_df = skills_df.reset_index(drop=True)
new_df = pd.concat(
    (df.drop(columns='skill').reset_index(drop=True), skills_df), axis=1)
new_df.head(3)

# Decompose the combined feature matrix; `p` holds the right singular
# vectors as columns and `nssd` the share per singular value.
q, s, p, nssd = truncated_svd(new_df.to_numpy())

nssd_cum(nssd)

%%capture
nssd.cumsum()[70] # zero-based index 70 = cumulative share of the first 71 SVs (index + 1)

# Project the data onto the first two columns of `p` and plot it together
# with the per-feature arrows.
X_new = new_df.to_numpy().dot(p[:, :2])
plot_svd(X_new, new_df.columns, p)

# Hand-picked bar colors for the first ten singular vectors, one string per
# SV and one character per bar (15 bars each), in plotting order:
# 'g' -> 'gray', 'k' -> 'black'.
# NOTE(review): the gray/black assignment was chosen by hand for this data
# set - confirm what the highlight is meant to convey before reusing it.
_bar_color_names = {'g': 'gray', 'k': 'black'}
_sv_bar_color_codes = [
    'ggggggggggggkkk',  # SV1
    'kggggkggkgggkkk',  # SV2
    'ggggggggggggkkk',  # SV3
    'gggggkkkkkkkkkk',  # SV4
    'ggggggggggkkkkk',  # SV5
    'gggggkkkkkkkkkk',  # SV6
    'gkgkggggkkkkkkk',  # SV7
    'ggkkgggggkkkkkk',  # SV8
    'ggggkggggkkgkkk',  # SV9
    'ggggggggkgkkkkk',  # SV10
]

feature_names = new_df.columns

# For each singular vector: a horizontal bar chart of the 15 features with
# the largest absolute weight (signed values are plotted), followed by the
# matching word cloud.
for sv_index, color_codes in enumerate(_sv_bar_color_codes):
    fig, ax = plt.subplots()
    order = np.argsort(np.abs(p[:, sv_index]))[-15:]
    ax.barh([feature_names[o] for o in order], p[order, sv_index],
            color=[_bar_color_names[code] for code in color_codes])
    ax.set_title(f'SV{sv_index + 1}')
    plt.show()

    word_cloud_svd(sv_index)

	jobTitle	jobId	jobDetails
0	Data Analyst	71648955	Description ISTA Solutions an outsourcing offs...
1	NIGHTSHIFT \| DATA ANALYST (POWER BI) \| WFH	71644101	DATA ANALYST Work for our global clients and i...
2	Data Analyst	71571071	We are Gold Coast s leading pharmacy group wit...

	classification	companyName	location	id	jobLocation	roleId	salary	subClassification	teaser	title	workType	currencyLabel	suburb
0	{'id': '6281', 'description': 'Information & C...	ISTA Solutions, Inc.	Makati City	71648955	{'label': 'Makati City, Metro Manila', 'countr...	data-analyst	₱30,000 – ₱35,000 per month	{'id': '6283', 'description': 'Business/System...	Data Analytics	Data Analyst	Full time	None	None
1	{'id': '1223', 'description': 'Science & Techn...	Satellite Office	Taguig City	71644101	{'label': 'Taguig City, Metro Manila', 'countr...	data-analyst	None	{'id': '6222', 'description': 'Mathematics, St...	DATA ANALYSTWork for our global clients and im...	NIGHTSHIFT \| DATA ANALYST (POWER BI) \| WFH	Full time	None	None
2	{'id': '6281', 'description': 'Information & C...	Chempro	Metro Manila	71571071	{'label': 'Metro Manila', 'countryCode': 'PH',...	data-analyst	₱35,000 – ₱45,000 per month	{'id': '6283', 'description': 'Business/System...	The successful candidate must have a good unde...	Data Analyst	Full time	None	None

	companyName	location	id	roleId	salary	teaser	title	workType	currencyLabel	suburb	classification_description	subClassification_description	jobLocation_label
0	ISTA Solutions, Inc.	Makati City	71648955	data-analyst	₱30,000 – ₱35,000 per month	Data Analytics	Data Analyst	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Makati City, Metro Manila
1	Satellite Office	Taguig City	71644101	data-analyst	None	DATA ANALYSTWork for our global clients and im...	NIGHTSHIFT \| DATA ANALYST (POWER BI) \| WFH	Full time	None	None	Science & Technology	Mathematics, Statistics & Information Sciences	Taguig City, Metro Manila
2	Chempro	Metro Manila	71571071	data-analyst	₱35,000 – ₱45,000 per month	The successful candidate must have a good unde...	Data Analyst	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Metro Manila

	companyName	location	id	roleId	salary	teaser	workType	currencyLabel	suburb	classification_description	subClassification_description	jobLocation_label	jobTitle	jobId	jobDetails
0	ISTA Solutions, Inc.	Makati City	71648955	data-analyst	₱30,000 – ₱35,000 per month	Data Analytics	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Makati City, Metro Manila	Data Analyst	71648955	Description ISTA Solutions an outsourcing offs...
1	Satellite Office	Taguig City	71644101	data-analyst	None	DATA ANALYSTWork for our global clients and im...	Full time	None	None	Science & Technology	Mathematics, Statistics & Information Sciences	Taguig City, Metro Manila	NIGHTSHIFT \| DATA ANALYST (POWER BI) \| WFH	71644101	DATA ANALYST Work for our global clients and i...
2	Chempro	Metro Manila	71571071	data-analyst	₱35,000 – ₱45,000 per month	The successful candidate must have a good unde...	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Metro Manila	Data Analyst	71571071	We are Gold Coast s leading pharmacy group wit...

	id	jobTitle	jobDetails	companyName	location	roleId	salary	teaser	workType	currencyLabel	suburb	classification	subClassification	jobLocation
0	71648955	Data Analyst	Description ISTA Solutions an outsourcing offs...	ISTA Solutions, Inc.	Makati City	data-analyst	₱30,000 – ₱35,000 per month	Data Analytics	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Makati City, Metro Manila
1	71644101	NIGHTSHIFT \| DATA ANALYST (POWER BI) \| WFH	DATA ANALYST Work for our global clients and i...	Satellite Office	Taguig City	data-analyst	None	DATA ANALYSTWork for our global clients and im...	Full time	None	None	Science & Technology	Mathematics, Statistics & Information Sciences	Taguig City, Metro Manila
2	71571071	Data Analyst	We are Gold Coast s leading pharmacy group wit...	Chempro	Metro Manila	data-analyst	₱35,000 – ₱45,000 per month	The successful candidate must have a good unde...	Full time	None	None	Information & Communication Technology	Business/Systems Analysts	Metro Manila

Import Libraries

Define Functions

Data Source

Data Description

Data Exploration

Data Collection

Data Preprocessing

Dimensionality Reduction & Analysis

Summary and Insights

	advertiser	branding	bulletPoints	classification	companyName	companyProfileStructuredDataId	displayStyle	displayType	listingDateDisplay	...	tags	currencyLabel	suburb	suburbId	suburbWhereValue	searchInsights	area	areaId	areaWhereValue
0	{'id': '60350712', 'description': 'ISTA Soluti...	{'id': 'f460edef-5f55-4281-949f-cc047938f9ba.1...	[]	{'id': '6281', 'description': 'Information & C...	ISTA Solutions, Inc.	429685	{'search': 'A'}	standout	21h ago	...	None	None	None	NaN	None	None	None	NaN	None
1	{'id': '60282266', 'description': 'Satellite O...	{'id': '11d87da4-efee-4415-8f94-58fecb763ffa.1...	[]	{'id': '1223', 'description': 'Science & Techn...	Satellite Office	307252	{'search': 'A'}	standout	1d ago	...	None	None	None	NaN	None	None	None	NaN	None
2	{'id': '60241355', 'description': 'Chempro QLD...	{'id': '9b5fcf2b-15a5-4c75-9a26-940c3484f762.1...	['HMO for you and one dependent', 'Up to 23 le...	{'id': '6281', 'description': 'Information & C...	Chempro	353343	{'search': 'A'}	standout	5d ago	...	None	None	None	NaN	None	None	None	NaN	None

	id	jobTitle	jobDetails	companyName	location	roleId	salary	teaser	workType	currencyLabel	suburb	classification	subClassification	jobLocation
0	70301897	URGENT! \| Real Time Analyst \| Pampanga Site	JOB SUMMARY:Intraday specialist is a critical ...	Tata Consultancy Services	Central Luzon	real-time-analyst	None	JOB SUMMARY:Intraday specialist is a critical ...	Full time	None	None	Call Centre & Customer Service	Management & Support	Central Luzon
1	70307629	Accountant	Candidate must possess at least Bachelor's/Col...	None	Quezon City	accountant	₱20,000 – ₱23,000 per month	Candidate must possess at least Bachelor's/Col...	Full time	None	None	Accounting	Financial Accounting & Reporting	Quezon City, Metro Manila
2	70315694	Financial Reporting Analyst	QualificationsBachelor of Science in Accountan...	Knowles Electronics (Philippines) Corporation	Central Visayas	financial-reporting-analyst	None	Qualifications Bachelor of Science in Accounta...	Full time	None	None	Accounting	Financial Accounting & Reporting	Central Visayas

	classification	skill
0	Call Centre & Customer Service	Audits Managing Intraday Plan and communicatio...
1	Accounting	Honest, Hardworking, Dependable, Preferably CP...
2	Accounting	and both oral and written communication skills...

	accounting	activities	...	willing	word	written	written communication
0	0.000000	0.000000	...	0.143887	0.169077	0.000000	0.000000
1	0.288326	0.000000	...	0.000000	0.000000	0.000000	0.000000
2	0.287454	0.135782	...	0.121639	0.000000	0.099399	0.145639

SV	Job Segment	Required Skills	Summary
SV1	Technical Client Support Roles	- Strong communicator - Technical skills and knowledge of relevant tools - SQL proficiency - Ability to deal with clients	Hires need to be technically knowledgeable and strong communicators, especially with clients.
SV2	Client-Centric Business Services Roles	- Accounting or financial knowledge - Strong communicator - Ability to deal with clients - Excel proficiency - Attention to detail - Good at written communication	Desired hires should be technically knowledgeable, strong communicators, and proficient in Excel.
SV3	Customer-Centric Professions	- Amenable and friendly - Customer-centric - Organized	The focus is on dealing with customers, requiring the ability to engage with people effectively.
SV4	Client-Centric Professions	- Ability to deal with clients - Ability to improve performance to satisfy clients	The emphasis is on working with clients, with a strong focus on client interaction.
SV5	Empathy-Communication Spectrum	- Willingness to improve lives (Healthcare & Medical) - Strong communication skills (HR, Marketing, Banking, Admin)	Represents the continuum between making a positive impact on lives and effective communication.
SV6	Serving Customers vs. Candidates	- Customer-centric roles (Marketing, Sales, Healthcare, etc.) - Candidate-centric roles (HR, Accounting, IT, Call Centre)	Highlights the distinction between roles focusing on customers/clients and those focusing on candidates.
SV7	Service and Communication to Technical Expertise Spectrum	- Communication-focused roles (Marketing, HR, Accounting, etc.) - Technical roles (Banking, Manufacturing, Logistics, etc.)	Emphasizes the shift from communication/service-oriented roles to those requiring technical expertise.
SV8	Technical Operations to Professional Services Continuum	- Interpersonal skills - Strong communication skills - Excel proficiency - Good planner (Admin, Manufacturing, Logistics, etc.)	Represents the spectrum from roles emphasizing operational skills to those in professional services.
SV9	Strategic Leadership to Technical Expertise Continuum	- Good planner - Strong interpersonal skills - Future-thinking - Team player - Strong communication skills (Manufacturing, Banking, Marketing, etc.)	Captures the range from roles prioritizing strategic planning to those emphasizing technical expertise.
SV10	Analytical vs Administrative Professions	- Motivation - Analytical skills - Presentation skills (Sales) - Excel and Word proficiency - Future-thinking (Admin, Manufacturing, Healthcare, etc.)	Represents the continuum from roles requiring motivation and analytics to those emphasizing administrative proficiency.