// 02_DoddFrank_Act/ 

// Working directory for Windows users:
//cd %systemdrive%\Users\%username%\Downloads\Codes\02_DoddFrank_Act\
// Working directory for macOS users:
//cd ~/Downloads/Codes/02_DoddFrank_Act/

forval i=1/16 {
insheet using Source_datasets/DFA-titles_processed/cons-count_title_`i'.csv, delimiter(";") clear
	drop v1
	rename v2 key
	rename v3 category
	rename v4 count

	sort key count

	egen id_key = group(key)
	egen id_category = group(category)

	gen _one = 1

	bysort id_category: egen category_count = sum(count)
	bysort id_category: egen category_unique_count = sum(_one)

	bysort id_key: egen total_count = sum(count)
drop count _one

	bysort id_key: gen _duplicate = _N
	gsort - _duplicate + id_key

	keep category category_count category_unique_count
duplicates drop

	gsort category
	gen title = `i'
outsheet using Source_datasets/DFA-titles_processed/category_cons_count_title_`i'.csv, delimiter(";") replace	
save Source_datasets/DFA-titles_processed/category_cons_count_title_`i'.dta, replace
}

// Stack the 16 per-title category summaries into a single dataset and export it.
clear
forvalues t = 1/16 {
	append using Source_datasets/DFA-titles_processed/category_cons_count_title_`t'.dta
}

// No delimiter() option is given here, so outsheet writes this file with its
// default (tab) delimiter, unlike the ";"-delimited per-title CSV files.
outsheet using Source_datasets/DFA-titles_processed/category_cons_count_all_titles.csv, replace

//
// COMPUTE MOST OCCURRING WORDS
//

// Convert every per-title count CSV into a .dta file with named columns so
// that the titles can be appended to one another afterwards.
forvalues t = 1/16 {
	insheet using Source_datasets/DFA-titles_processed/cons-count_title_`t'.csv, delimiter(";") clear

	// Column v1 is discarded; v2-v4 become key, category and count.
	drop v1
	rename v4 count
	rename v3 category
	rename v2 key

	save ./Source_datasets/DFA-titles_processed/cons-count_title_`t'.dta, replace
}

// Pool the counts of all 16 titles, then export
//   (1) every key with its total count across titles, ordered by category and
//       descending total count, and
//   (2) the number of distinct (key, category, total_count) rows per category.
use ./Source_datasets/DFA-titles_processed/cons-count_title_1.dta, clear
forval i=2/16 {
	append using ./Source_datasets/DFA-titles_processed/cons-count_title_`i'.dta
}

// Total occurrences of each key over all titles. total() is the current name
// of the legacy egen sum() function.
// NOTE(review): the total is grouped by key only; if a key can appear under
// more than one category, each of its rows carries the combined total - confirm.
bysort key: egen total_count = total(count)
keep key category total_count
duplicates drop

// key is the final tie-breaker: without it, rows sharing the same category and
// total_count are left in unspecified order and the exported file is not
// reproducible from run to run.
gsort + category - total_count + key
outsheet using ./Source_datasets/DFA-titles_processed/category_cons_all_titles_most_frequent_keys.csv, replace

// Number of remaining rows per category (one per distinct key/category/total).
bysort category: gen unique_count = _N

keep category unique_count
duplicates drop

outsheet using ./Source_datasets/DFA-titles_processed/category_unique_count.csv, replace


// Clean up: delete the per-title category summary .dta files from disk.
// NOTE(review): the cons-count_title_`i'.dta files saved earlier in this
// script are not erased here - confirm that leaving them on disk is intended.
forvalues t = 1/16 {
	erase ./Source_datasets/DFA-titles_processed/category_cons_count_title_`t'.dta
}

