-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscores.py
740 lines (607 loc) · 36.2 KB
/
scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
import pandas as pd
import mysql.connector
import datetime
import requests
import os
from fuzzywuzzy import fuzz
class DBConnector:
"""
A class for connecting to a MySQL database and loading tables as pandas DataFrames.
Attributes:
----------
host : str
The hostname of the MySQL server.
username : str
The username to use for authentication.
password : str
The password to use for authentication.
database : str
The name of the database to connect to.
Methods:
-------
load_tables() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]
Connects to the MySQL database and loads the candidate, degree_bsc, degree_master, and degree_phd tables
as pandas DataFrames. Returns a tuple containing the DataFrames.
"""
def __init__(self, host, username, password, database):
self.host = host
self.username = username
self.password = password
self.database = database
def load_tables(self):
"""
Connects to the MySQL database and loads the candidate, degree_bsc, degree_master, and degree_phd tables
as pandas DataFrames. Returns a tuple containing the DataFrames.
Returns:
-------
Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]
A tuple containing the candidate, degree_bsc, degree_master, and degree_phd DataFrames.
"""
# Connect to the database
cnx = mysql.connector.connect(
host=self.host,
user=self.username,
password=self.password,
database=self.database
)
# Load candidate table as pandas DataFrame
candidate_df = pd.read_sql_query("SELECT * FROM candidate", cnx)
# Load degree_bsc table as pandas DataFrame
degree_bsc_df = pd.read_sql_query("SELECT * FROM degree_bsc", cnx)
# Load degree_master table as pandas DataFrame
degree_master_df = pd.read_sql_query("SELECT * FROM degree_master", cnx)
# Load degree_phd table as pandas DataFrame
degree_phd_df = pd.read_sql_query("SELECT * FROM dergee_phd", cnx)
# Load teaching_exp table as pandas DataFrame
teaching_exp_df = pd.read_sql_query("SELECT * FROM teaching_exp", cnx)
# Load industry_exp table as pandas DataFrame
industry_exp_df = pd.read_sql_query("SELECT * FROM industry_exp", cnx)
# Load patent_exp table as pandas DataFrame
patents_df = pd.read_sql_query("SELECT * FROM patents", cnx)
# Load supervision_bsc table as pandas DataFrame
supervision_bsc_df = pd.read_sql_query("SELECT * FROM supervision_bsc", cnx)
# Load supervision_masters table as pandas DataFrame
supervision_masters_df = pd.read_sql_query("SELECT * FROM supervision_master", cnx)
# Load supervision_phd table as pandas DataFrame
supervision_phd_df = pd.read_sql_query("SELECT * FROM supervision_phd", cnx)
# Load committee_work table as pandas DataFrame
committee_work_df = pd.read_sql_query("SELECT * FROM committee_work", cnx)
# Load quality_accreditation table as pandas DataFrame
quality_accreditation_df = pd.read_sql_query("SELECT * FROM quality_accreditation", cnx)
# Load certificates table as pandas DataFrame
certificates_df = pd.read_sql_query("SELECT * FROM certificates", cnx)
# Load awards table as pandas DataFrame
awards_df = pd.read_sql_query("SELECT * FROM awards", cnx)
# Load funded_research table as pandas DataFrame
funded_research_df = pd.read_sql_query("SELECT * FROM funded_research", cnx)
# Load citation table as pandas DataFrame
citation_df = pd.read_sql_query("SELECT * FROM citation", cnx)
# Close the connection
cnx.close()
# Return the DataFrames
return candidate_df, degree_bsc_df, degree_master_df, degree_phd_df, teaching_exp_df, industry_exp_df, patents_df, supervision_bsc_df, supervision_masters_df, supervision_phd_df, committee_work_df, quality_accreditation_df, certificates_df, awards_df, funded_research_df, citation_df
class ScoreCalculator(DBConnector):
"""
Class to assign a numerical score to a candidates application based on 5 major areas of past
performance i.e. university rankings from which the candidate studied, teaching exp, industry exp,
technical publications, and other variables such as patents, funded research, and committee work etc.
Attributes:
-----------
host: str
String containing the name of the server hosting the instance of MySQL db
username: str
String containing the username of a user added to the database
password: str
String containing the password of a user added to the database
database: str
String containing the name of the MySQL database
candidate_df : pandas.DataFrame
DataFrame containing candidate information including candidate_id
degree_bsc_df : pandas.DataFrame
DataFrame containing Bachelor's degree information including QS_uni_rank_bsc
degree_master_df : pandas.DataFrame
DataFrame containing Master's degree information including QS_uni_rank_master
degree_phd_df : pandas.DataFrame
DataFrame containing PhD degree information including QS_uni_rank_phd
teaching_exp_df: pandas.DataFrame
DataFrame containing teaching expreience information
teaching_exp_df: pandas.DataFrame
DataFrame containing teaching expreience information
inudstry_exp_df: pandas.DataFrame
DataFrame containing industry expreience information
patents_df: pandas.DataFrame
DataFrame containing patents information
supervision_bsc_df: pandas.DataFrame
DataFrame containing bsc supervision expreience information
supervision_masters_df: pandas.DataFrame
DataFrame containing masters supervision expreience information
supervision_phd_df: pandas.DataFrame
DataFrame containing phd supervision expreience information
committee_work_df: pandas.DataFrame
DataFrame containing committee work expreience information
quality_accreditation_df: pandas.DataFrame
DataFrame containing QA expreience information
certificates_df: pandas.DataFrame
DataFrame containing certificates information
awards_df: pandas.DataFrame
DataFrame containing awards information
funded_research_df: pandas.DataFrame
DataFrame containing funded research information
citation_df: pandas.DataFrame
DataFrame containing technical publications information
"""
# University ranking vars
MAX_SCORE_WITH_PHD_QS_LT_100 = 15
MAX_SCORE_WITH_PHD_QS_GT_100 = 11.5
NO_PHD_MAX_SCORE = 5
DEDUCTION = 5
# Teaching exp vars
MAX_SCORE_NON_ARABIC_PER_YEAR = 3
MAX_SCORE_ARABIC_PER_YEAR = 2
# Industry exp vars
MAX_IND_EXP_SCORE_PER_YEAR = 1
def __init__(self, host,username,password,database, candidate_df, degree_bsc_df, degree_master_df, degree_phd_df, teaching_exp_df, industry_exp_df, patents_df, supervision_bsc_df, supervision_masters_df, supervision_phd_df, committee_work_df, quality_accreditation_df, certificates_df, awards_df, funded_research_df, citation_df):
super().__init__(host, username, password, database) # inherit host, username, pass, and db parameters from DBConnector class
self.candidate_df = candidate_df
self.degree_bsc_df = degree_bsc_df
self.degree_master_df = degree_master_df
self.degree_phd_df = degree_phd_df
self.teaching_exp_df = teaching_exp_df
self.industry_exp_df = industry_exp_df
self.patents_df = patents_df
self.supervision_bsc_df = supervision_bsc_df
self.supervision_masters_df = supervision_masters_df
self.supervision_phd_df = supervision_phd_df
self.committee_work_df = committee_work_df
self.quality_accreditation_df = quality_accreditation_df
self.certificates_df = certificates_df
self.awards_df = awards_df
self.funded_research_df = funded_research_df
self.citation_df = citation_df
self.journal_ranks = pd.read_csv('journal_ranks.csv')
def university_score(self):
"""
Calculates the score for a candidate based on the ranking of the universities
from which the candidate got different degrees from
Returns:
--------
pandas.DataFrame
df containing the university of graduation score for each candidate
"""
scores = {}
for candidate_id in self.candidate_df['candidate_id']:
# Has PHD
if candidate_id in self.degree_phd_df['candidate_id'].to_list():
# Case 1: Phd degree qs <100 and bsc, masters <100
if self.degree_phd_df.loc[self.degree_phd_df['candidate_id']==candidate_id, 'QS_uni_rank_phd'].iloc[0] <= 100:
if (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] <= 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] <= 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100
continue
# Case 2: phd <100 and master > 100, bsc <100
elif (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100 - self.DEDUCTION
continue
# Case 3: phd <100 and bsc >100, masters <100
elif self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100 and \
(self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100 - self.DEDUCTION
continue
# Case 4: phd <100 and master < 100 , bsc >100
elif (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100 - self.DEDUCTION
continue
# Case 5: phd <100 and bsc <100 , masters >100
elif self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100 and \
(self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100 - self.DEDUCTION
continue
# Case 6: Phd degree qs < 100 and bsc, masters >100
else:
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_LT_100 - self.DEDUCTION*1.2
continue
# Case 7: phd degree qs>100 and masters bsc <100
else:
if (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] <= 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] <= 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100
continue
# Case 8: phd degree qs > 100 and master > 100 , bsc <100
elif (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100 - self.DEDUCTION
continue
# Case 9: phd degree qs > 100 and bsc > 100 , master <100
elif self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100 and \
(self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100 - self.DEDUCTION
continue
# Case 10: phd degree qs > 100 and master < 100 , bsc >100
elif (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100) and \
(self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100 - self.DEDUCTION
continue
# Case 11: phd degree qs > 100 and bsc < 100 , master >100
elif self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100 and \
(self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100):
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100 - self.DEDUCTION
continue
# Case 12: phd degree qs > 100 and bsc, masters > 100
else:
scores[candidate_id] = self.MAX_SCORE_WITH_PHD_QS_GT_100 - 2*self.DEDUCTION/2
continue
# No PHD
else:
# No Masters
if (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id].empty):
# Case 13: no masters and bsc <100
if (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE -self.DEDUCTION/2
# Case 14: no masters and bsc >100
elif (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100):
scores[candidate_id] = self.DEDUCTION/3
# No PHD has Masters
else:
# Case 15: both masters and bsc qs <100
if (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100) and (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE
continue
# Case 16: bsc qs <100 , masters >100
elif (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100) and (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE - self.DEDUCTION/3.5
continue
# Case 17: master qs <100 , bsc >100
elif (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100) and (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE -self.DEDUCTION/3.5
# Case 16: bsc qs >100, masters <100
elif (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] > 100) and (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] < 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE - self.DEDUCTION/3.5
continue
# Case 17: master qs >100, bsc <100
elif (self.degree_bsc_df.loc[self.degree_bsc_df['candidate_id']==candidate_id, 'QS_uni_rank_bsc'].iloc[0] < 100) and (self.degree_master_df.loc[self.degree_master_df['candidate_id']==candidate_id, 'QS_uni_rank_master'].iloc[0] > 100):
scores[candidate_id] = self.NO_PHD_MAX_SCORE -self.DEDUCTION/3.5
# Case 18: both masters and bsc qs>100
else:
scores[candidate_id] = self.NO_PHD_MAX_SCORE - self.DEDUCTION/2
return pd.DataFrame(scores.items(), columns=['candidate_id', 'uni_ranking_score'])
def teaching_expereince_score(self):
"""
Calculates the teaching experience score for each candidate in the candidate dataframe.
Returns:
A pandas dataframe containing the candidate IDs and their corresponding teaching experience scores.
"""
# Create a list of Arab countries
arabic_speaking_countries = ["Algeria", "Bahrain", "Comoros", "Djibouti", "Egypt", "Iraq", "Jordan", "Kuwait", "Lebanon", "Libya", "Mauritania", "Morocco", "Oman", "Palestine", "Qatar", "Saudi Arabia", "Somalia", "Sudan", "Syria", "Tunisia", "United Arab Emirates", "Yemen"]
scores = {}
for candidate_id in self.candidate_df['candidate_id']:
# Filter teaching_exp_df for each candidate
candidate_teaching_exp = self.teaching_exp_df[self.teaching_exp_df['candidate_id']==candidate_id]
candidate_score = 0
for _, row in candidate_teaching_exp.iterrows():
# Convert the date strings to datetime objects
if row['teaching_current_position'] == 'yes':
end_date = datetime.date.today()
else:
end_date = pd.to_datetime(row['teaching_to_end_date']).date()
start_date = pd.to_datetime(row['teaching_from_start_date']).date()
# Calculate the duration in years and append to the durations list
duration_years = (end_date - start_date).days/365.25
# Cap duration to a maximum of 5 years
if duration_years >5:
duration_years = 5
else:
duration_years = duration_years
teaching_exp_country = row['teachingexp_country']
if teaching_exp_country.lower() in [arab_country.lower() for arab_country in arabic_speaking_countries]:
current_score = self.MAX_SCORE_ARABIC_PER_YEAR*duration_years
candidate_score += current_score
else:
current_score = self.MAX_SCORE_NON_ARABIC_PER_YEAR * duration_years
candidate_score += current_score
# Cap overall maximum teaching score to 15
if candidate_score > 15.0:
scores[candidate_id] = 15
else:
scores[candidate_id] = candidate_score
return pd.DataFrame(scores.items(), columns=['candidate_id', 'teaching_exp_score'])
def industry_experience_score(self):
"""
Computes the industry experience score for each candidate in the candidate dataframe.
Returns:
- pandas DataFrame: a dataframe with two columns: 'candidate_id' and 'industry_exp_score'.
"""
scores = {}
for candidate_id in self.candidate_df['candidate_id']:
# Filter industry_exp_df for each candidate
candidate_ind_exp = self.industry_exp_df[self.industry_exp_df['candidate_id']==candidate_id]
candidate_score = 0
for _, row in candidate_ind_exp.iterrows():
# Convert the date strings to datetime objects
if row['industry_current_position'] == 'yes':
end_date = datetime.date.today()
else:
end_date = pd.to_datetime(row['industry_to_end_date']).date()
start_date = pd.to_datetime(row['industry_from_start_date']).date()
# Calculate the duration in years
duration_years = (end_date - start_date).days/365.25
# Cap duration to a maximum of 5 years
if duration_years >5:
duration_years = 5
else:
duration_years = duration_years
current_score = self.MAX_IND_EXP_SCORE_PER_YEAR*duration_years
candidate_score += current_score
# Cap overall maximum industry score to 5
if candidate_score > 5.0:
scores[candidate_id] = 5
else:
scores[candidate_id] = candidate_score
return pd.DataFrame(scores.items(), columns=['candidate_id', 'industry_exp_score'])
def others_score(self):
"""
Calculate the "Others" score for each candidate based on their patents, supervision,
committee work, quality accreditation, certificates, awards, management experience,
and funded research.
Returns:
--------
A pandas DataFrame containing the "Others" score for each candidate and the scores for each component (patents, supervision, committee work, quality accreditation, certificates, awards, management experience, and funded research).
"""
patents_scores = {}
supervision_scores = {}
committee_work_scores = {}
quality_accreditation_scores = {}
certificates_scores = {}
awards_scores = {}
funded_research_scores = {}
management_exp_scores = {}
funded_research_total_per_candidate = {}
scores = {}
for candidate_id in self.candidate_df['candidate_id'].unique().tolist():
# Patents
if candidate_id in self.patents_df['candidate_id']:
patents_scores[candidate_id] = 2
else:
patents_scores[candidate_id] = 0
# Supervision
if candidate_id in self.supervision_bsc_df['candidate_id'].unique().tolist() or self.supervision_masters_df['candidate_id'].unique().tolist() or self.supervision_phd_df['candidate_id'].unique().tolist():
supervision_scores[candidate_id] = 2
else:
supervision_scores[candidate_id] = 0
# Comittee Work
if candidate_id in self.committee_work_df['candidate_id'].unique().tolist():
committee_work_scores[candidate_id] = 1
else:
committee_work_scores[candidate_id] = 0
# Quality Accreditation
if candidate_id in self.quality_accreditation_df['candidate_id'].unique().tolist():
quality_accreditation_scores[candidate_id] = 1
else:
quality_accreditation_scores[candidate_id] = 0
# Certificates
if candidate_id in self.certificates_df['candidate_id'].unique().tolist():
certificates_scores[candidate_id] = 1
else:
certificates_scores[candidate_id] = 0
# Awards
if candidate_id in self.awards_df['candidate_id'].unique().tolist():
awards_scores[candidate_id] = 1
else:
awards_scores[candidate_id] = 0
# Management Exp
if 'yes' in self.teaching_exp_df[self.teaching_exp_df['candidate_id']==candidate_id]['teaching_administrative_position'].tolist() or 'yes' in self.industry_exp_df[self.industry_exp_df['candidate_id']==candidate_id]['industry_administritive_position'].tolist():
management_exp_scores[candidate_id] = 1
else:
management_exp_scores[candidate_id] = 0
# Funded Research
funded_total_amount = self.funded_research_df[self.funded_research_df['candidate_id']==candidate_id]['funded_amount_usd'].sum()
funded_research_total_per_candidate[candidate_id] = funded_total_amount
max_funded_amount = max(funded_research_total_per_candidate.values())
for candidate_id in self.candidate_df['candidate_id'].unique().tolist():
candidate_funded_amount = self.funded_research_df[self.funded_research_df['candidate_id']==candidate_id]['funded_amount_usd'].sum()
if max_funded_amount !=0:
funded_research_scores[candidate_id] = (candidate_funded_amount/max_funded_amount)*2
# Real time entries; funded research amt can be zero for few; avoid error in that case
else:
funded_research_scores[candidate_id] = 0
scores = {"patent_others":patents_scores, "supervision_others":supervision_scores,
"committe_others":committee_work_scores,"qa_others": quality_accreditation_scores,
"certificates_others":certificates_scores,"awards_others":awards_scores,
"managemnet_exp_others":management_exp_scores,"funded_research_others":funded_research_scores}
# create the others output dataframe
df = pd.DataFrame(scores)
# Sum components of others to get others_total
df['others_total'] = df.sum(axis=1)
# Add candidate_id col
df['candidate_id'] = self.candidate_df['candidate_id'].unique()
return pd.DataFrame(df)
def __journal_name_parser(self, technical_publication) -> str:
"""
Extracts the name of the journal from a technical publication.
Parameters:
technical_publication (str): The text of the technical publication.
Returns:
str: The name of the journal if found, or an empty string if not.
Raises:
Exception: If an error occurs while processing the API request or parsing the response.
"""
# Define the input data for the API request
data = {
"input": {
"text": technical_publication
}
}
# Add authorization headers for the Scale API
headers = {"Authorization": "Basic clgjomk4t09l71axugxxzgmzq"}
try:
# Make a POST request to the Scale API
response = requests.post(
"https://dashboard.scale.com/spellbook/api/v2/deploy/9e63bii",
json=data,
headers=headers
)
# Extract the output JSON from the response and convert it to a Python dictionary
json_out = eval(response.json()['output'])
# Loop through the dictionary to find the journal name and return it if found
labels = ['title: ', 'journal: ', 'year: ', 'doi: ']
for label, item in zip(labels, json_out):
if label == 'journal: ':
return json_out
# Catch any exceptions that occur during the API request or parsing and raise a new exception with a more informative message
except Exception as e:
raise Exception("Error occurred while processing the API request or parsing the response.") from e
def __find_best_match(self,string1, df, column_name='Title', method='token_sort_ratio'):
"""
Finds the best match for a given string in a pandas dataframe using fuzzy string matching.
Parameters:
-----------
string1 : str
The input string to find a match for.
df : pandas DataFrame
The dataframe containing the strings to match against.
column_name : str, optional (default='Product Description')
The name of the column in the dataframe containing the strings to match against.
method : str, optional (default='token_sort_ratio')
The fuzzy matching algorithm to use. Options include:
- 'ratio': simple ratio
- 'partial_ratio': partial ratio
- 'token_sort_ratio': token sort ratio
- 'token_set_ratio': token set ratio
Returns:
--------
best_match : str
The best matching string in the dataframe.
max_similarity_score : float
The similarity score between the input string and the best matching string.
"""
# initialize variables
best_match = None
max_similarity_score = float('-inf')
# iterate over dataframe rows and compute similarity scores
for _, row in df.iterrows():
string2 = row[column_name]
similarity_score = getattr(fuzz, method)(string1, string2)
# update best match if current similarity score is higher
if similarity_score > max_similarity_score:
max_similarity_score = similarity_score
best_match = string2
return best_match, max_similarity_score
def __insert_values(self, values):
"""
Insert values into the citation table for a given candidate ID and citation ID.
Args:
values (list): A list of values to be inserted into the citation table. The order of values should be as follows:
[cit_id, candidate_id, cit_research_title, cit_journal_title, cit_year_publication_issue_volume, cit_doi]
Returns:
None
Raises:
None
"""
# create a connection to the database
cnx = mysql.connector.connect(user=self.username, password=self.password, host=self.host, database=self.database)
# Define the query with placeholders for the values to insert/update
query = "INSERT INTO citation (cit_peer_reviewed_journals, cit_research_title, cit_journal_title, cit_year_publication_issue_volume, cit_doi, candidate_id, cit_id) VALUES (%s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE cit_peer_reviewed_journals = VALUES(cit_peer_reviewed_journals), cit_research_title = VALUES(cit_research_title), cit_journal_title = VALUES(cit_journal_title), cit_year_publication_issue_volume = VALUES(cit_year_publication_issue_volume), cit_doi = VALUES(cit_doi)"
# execute the query with the values
cursor = cnx.cursor()
cursor.execute(query, (values[6],values[2], values[3], values[4], values[5], values[1], values[0]))
# commit the changes to the database
cnx.commit()
# close the cursor and connection
cursor.close()
# close the database connection
cnx.close()
def technical_publications_score(self):
"""
Calculate technical publications score for each candidate based on the citation data in the
`citation_df` dataframe.
The score is calculated based on the candidate's published journals and their SJR Quartile rank,
where a higher SJR Quartile rank contributes more to the score. The maximum score is capped at 15.
Returns:
pandas.DataFrame: A dataframe containing the technical publications score for each candidate in the
`candidate_df` dataframe. The dataframe has two columns: 'candidate_id' and
'technical_publications_score'.
"""
scores = {}
cit_id = 1
for candidate_id in self.citation_df['candidate_id']:
# Get publications for each candidate
candidate_tech_publications = self.citation_df[self.citation_df['candidate_id']==candidate_id]['cit_peer_reviewed_journals'].values[0]
# Run academic references through parser to fetch journal name
publication_data_list = self.__journal_name_parser(candidate_tech_publications)
candidate_score = 0
# Loop over the publication data list to get all parased publications
for publication in zip(*[iter(publication_data_list)]*4):
# Tuple to list
publication = list(publication)
# Add cit_id, candidate_id, and user input to the payload
publication =[cit_id,candidate_id]+publication+[candidate_tech_publications]
# Upload parsed data to citations table
self.__insert_values(publication)
cit_id+=1
journal_name = publication[3]
# Run through fuzzy_string method to find best match jorunal in journal dataset
best_match_journal, max_similarity_score = self.__find_best_match(journal_name,self.journal_ranks,'Title')
# Match is strong i.e. the journal candidate published in is a valid jorunal
if max_similarity_score > 80:
sjr_quartile_rank = self.journal_ranks[self.journal_ranks['Title']==best_match_journal]['SJR Quartile']
if sjr_quartile_rank.values[0] == 'Q1':
candidate_score+=3
elif sjr_quartile_rank.values[0] == 'Q2':
candidate_score+=2
elif sjr_quartile_rank.values[0] == 'Q3':
candidate_score+=1
elif sjr_quartile_rank.values[0] == 'Q4' or sjr_quartile_rank.values[0] == '-':
candidate_score+=0.5
else:
candidate_score+=0
# Match is weak i.e. the journal candidate published in is not a valid journal
else:
candidate_score+=0
# Cap overall maximum industry score to 5
if candidate_score > 15.0:
scores[candidate_id] = 15.0
else:
scores[candidate_id] = candidate_score
return pd.DataFrame(scores.items(), columns=['candidate_id', 'technical_publications_score'])
def upload_cal_results(self):
"""
Uploads a CSV file containing score calculation results to a MySQL table.
Args:
None.
Returns:
None.
"""
# MySQL database connection
mydb = mysql.connector.connect(
host=self.host,
user=self.username,
password=self.password,
database=self.database
)
# Path to CSV file
csv_file = 'output.csv'
# Read CSV file into a pandas dataframe
df = pd.read_csv(csv_file)
# Check if the table already exists
cursor = mydb.cursor()
cursor.execute("SHOW TABLES LIKE 'score_cal_results'")
result = cursor.fetchone()
if result:
# Table already exists, insert only new data
existing_ids = set(pd.read_sql_query("SELECT candidate_id FROM score_cal_results", mydb)['candidate_id'])
new_data = df[~df['candidate_id'].isin(existing_ids)]
for row in new_data.itertuples():
query = f"INSERT INTO score_cal_results ({', '.join(df.columns)}) VALUES {tuple(row[1:])}"
cursor.execute(query)
else:
# Create a new table using the column names from the CSV file
cols = ", ".join([f"{col} FLOAT" for col in df.columns])
cursor.execute(f"CREATE TABLE score_cal_results ({cols})")
# Insert all data from the CSV file into the new table
for row in df.itertuples():
query = f"INSERT INTO score_cal_results ({', '.join(df.columns)}) VALUES {tuple(row[1:])}"
cursor.execute(query)
# Commit changes and close database connection
mydb.commit()
mydb.close()