stata


*There are four ways to include comments in a do-file.
*1. Begin the line with a ‘*’; Stata ignores such lines. * cannot be used within Mata.
*2. Place the comment in /* */ delimiters.
*3. Place the comment after two forward slashes, that is, //. Everything after the // to the end of the current line is considered a comment (unless the // is part of http://. . . ).
*4. Place the comment after three forward slashes, that is, ///. Everything after the /// to the end of the current line is considered a comment. However, when you use ///, the next line joins with the current line. /// lets you split long lines across multiple lines in the do-file.

*a sample
/*
clear
set more off
mkdir d:/chinafin
cd d:/chinafin
cnstock all
levelsof stkcd,local(stkcd)
foreach c of local stkcd{
	chinafin `c'
	}

*/

/*

*******************************************************************************
*General Do-File Header*
clear all			//clear memory
capture log close	//close any log-files (if still running)
*version 15.0		//your current version, change this to your version (and remove asterisk)!
set more off		//do not halt to show all output
*******************************************************************************

*cd "path"		//enter the path to your working directory and uncomment this line

sysuse nlsw88, clear		//open the dataset
*set scheme plotplain 		//Uncomment if you want to use the same style as in the book


**********************************************************************
*Creating Tables
**********************************************************************

***Model 1***
regress wage c.ttl_exp		//Only include work experience
estimates store M1		//Save results

***Model 2***
regress wage c.ttl_exp i.union i.south	//Add binary variables
estimates store M2			//Save results

***Model 3***
regress wage c.ttl_exp i.union i.south i.race	//Add ordinal variable
estimates store M3					//Save results

*Produce output*
estimates table M1 M2 M3, se stats(r2 N)

*Change number format*
estimates table M1 M2 M3, se stats(r2 N) b(%7.3f) se(%7.2f)


***Using the estout-Ado***

ssc install estout, replace			//Install ado

eststo M1: regress wage c.ttl_exp
eststo M2: regress wage c.ttl_exp i.union i.south
eststo M3: regress wage c.ttl_exp i.union i.south i.race

esttab M1 M2 M3 using "new_table.rtf"	//Output table in current working directory

esttab M1 M2 M3 using "new_table.rtf",  nogaps nomtitles r2 ///
	star(# 0.10 * 0.05 ** 0.01 *** 0.001) b(3) se label replace

**********************************************************************
*Creating Graphs
**********************************************************************

ssc install coefplot, replace				//Install ado
quietly regress wage c.ttl_exp i.union i.south i.race
coefplot, xline(0)					//Create vertical line at zero



quietly regress wage c.ttl_exp i.union i.south i.race
margins				//Show average wage when all variables are at the mean
margins, atmeans		//Show the means for all variables

margins, at(ttl_exp=(0(5)30))	//Compute predicted values for certain values of job experience
marginsplot

margins union, at(ttl_exp=(0(5)30))		//Furthermore differentiate by union status
marginsplot

margins union, at(ttl_exp=(0(5)30) south=(0 1))	//Subgraphs by union status
marginsplot
marginsplot, by(south)



*Including an interaction between job experience and itself to account
*for higher ordered terms
regress wage c.ttl_exp##c.ttl_exp i.union i.south i.race		//Run new regression model
margins union, at(ttl_exp=(0(5)30) south=(0 1))
marginsplot, by(union)
*/


************ another sample *******************************

/*

*******************************************************************************
*General Do-File Header*
clear all			//clear memory
capture log close	//close any log-files (if still running)
*version 15.0		//your current version, change this to your version (and remove asterisk)!
set more off		//do not halt to show all output
*******************************************************************************



***Setting your current working directory***
cd "C:/Users/username/stata-course/"

/*You must replace this generic example with the path to the folder
where your files are saved*/


***Showing current working directory***
pwd


***Show all files in the current folder***
dir
ls			//Alternative command


*use "filename.dta"			//Open desired Stata file


***Alternative: absolute folder paths***
*use "C:\Users\username\stata-course\filename.dta"


***Entering data manually***
clear all					//Start with a fresh memory
set obs 10					//10 observations (cases)
generate age = .
generate income = .
generate gender = .
edit						//Edit data
browse						//Inspect data


***Using preinstalled data***
sysuse auto, clear			//Open preinstalled file
describe					//Inspect data


***Exporting data to (open) file formats***
outsheet using "auto_export.csv"	//CSV - Comma Separated Values
export excel using "etest.xls"		//XLS - Microsoft Excel File Format


***Delimit and line breaks***
sysuse nlsw88, clear
#delimit ;				//Tell Stata that all commands end with a semicolon from here on
count if age < 40 &
	smsa == 1 &
	wage > 20 &
	grade >= 7;			//Finally end your long command with the semicolon
#delimit cr				//Restore standard settings

*/




**************************** one more sample ********************

/*
*******************************************************************************
*General Do-File Header*
clear all			//clear memory
capture log close	//close any log-files (if still running)
*version 15.0		//your current version, change this to your version (and remove asterisk)!
set more off		//do not halt to show all output
*******************************************************************************


*cd "path"		//enter the path to your working directory and uncomment this line

***Getting to know your data***
sysuse nlsw88		//open the dataset
describe			//basic information about the dataset


*Looking at special cases (Listing cases)
list if age < 40 & south == 1 & union == 1				//List all cases that satisfy the condition after "if"
list in 1/10											//List all information about first 10 cases
list idcode age grade union wage in 1/10 if age < 40	//Combine in and if


***Variable names and labels***
rename smsa metro			//renaming variable from "smsa" to "metro"
label variable metro "Standard Metropolitan Statistical Area"  //changing label
tabulate metro			//Inspect results


***Labeling values***
tabulate c_city						//inspect the variable c_city
label define yesno 1 "yes" 0 "no" 	//creating a new label
label values c_city yesno			//applying label to variable
tabulate c_city						//Inspect results


***IDs and unique identifiers***
isid idcode				//check if variable "idcode" is a unique identifier
generate ID = _n		// generate a new ID variable
label variable ID "new unique identifier"		//label new ID variable
duplicates list			//check for duplicates in the dataset

*IDs by Industry*
bysort industry: gen ind_ID = _n
/*This command sorts people by the industry they work in and creates a new ID
that identifies people within on industy group*/
order industry ind_ID			//Reorder columns for easier inspection
browse					//Inspect results
list ID ind_ID industry in 1/30, sepby(industry)					

*Bysort*
bysort industry (age): gen ind_age_ID = _n
/*This command sorts all people by industry and age and then creates a counter. By doing this,
the people within one industry group are further sorted by age. Compare what changes if you
omit the parentheses around age:*/
list ind_age_ID age industry in 1/30, sepby(industry)

bysort industry age: gen ind_age_ID2 = _n			//Not only order by age, also separate groups
order industry ind_ID ind_age_ID ind_age_ID2 age	//Reorder columns for easier inspection
browse												//Inspect results		
list ind_age_ID2 age industry in 1/30, sepby(industry)

***Sorting***
sort age
list ID age in 1/100		//Sort ascending
gsort -age
list ID age in 1/100		//Sort descending

***Missing values***
sysuse nlsw88, clear		//Undo changes, start fresh
misstable summarize, all	//receive an overview about missing values in the dataset

/* Now we want to count how many people are working more than 60 hours per week.
As we have missing values (depicted by dots) in this variable and missing values are
regarded as extremely large numbers by Stata, the following command will get us a wrong
result*/
count if hours > 60							//Stata counts 22 people, which is incorrect as 4 have missings!
count if hours > 60 & hours < .				//Correct result (18 people)
count if hours > 60 & !missing(hours)		//Alternative command


***Creating new variables***
generate ybirth = 1988 - age				//Generating year of birth of a person
label variable ybirth "Year of birth"		//Labeling variable
tabulate ybirth								//Inspecting results

generate age_squared = age^2				//Generating the squared version of age
*generate age_squared = age*age				//Alternative command
label variable age_squared "Aqe Squared"	//Labeling variable
tabulate age_squared						//Inspect results


***Special Functions***
*Inlist*
count if occupation == 1 | occupation == 2 | ///
	occupation == 3 | occupation == 4
count if inlist(occupation,1,2,3,4)				//Same result, shorter command


*Inlist, second usage*
count if inlist(1,union,smsa,c_city)
count if union == 1 | smsa == 1 | c_city == 1		//Alternative command



*Inrange*
count if wage >= 10 & wage <= 15
count if inrange(wage,10,15)					//Same result, shorter command

*Irecode*
generate hours_cat = .
replace hours_cat = 0 if hours <= 20
replace hours_cat = 1 if hours > 20 & hours <= 40
replace hours_cat = 2 if hours > 40 & hours !=.
tabulate hours_cat
drop hours_cat

generate hours_cat = irecode(hours,20,40)	//Same result, shorter command
tabulate hours_cat

*Autocode*
generate tenure_cat = autocode(tenure,5,0,27)
tabulate tenure_cat
tabulate tenure tenure_cat					//Crosscheck

*Egen*
egen maxvalue = rowmax(wage tenure hours)		//Find Maximum of all three variables for each case
list idcode maxvalue wage tenure hours in 1/20	//Inspect result


***The if qualifier***
count					//Shows number of observations in the dataset

/*Make sure that the variables used have no missing values as otherwise the results
might be wrong! This is important when you work with the following operators:
>
<
>=
<=
You can either delete all missings (drop if missing(age) == 1) or exclude these cases
as shown in the following examples.
*/

*Show number of people that are older or equal to 40 and are married:
count if age >= 40 & !missing(age) & married == 1

/*Show number of people that work in the mining industry or the construction industry
and are not white*/
count if (industry == 2 | industry == 3) & race != 1 & !missing(race)

/*Note that parentheses make a difference here! If you switch them, your result will differ*/
count if industry == 2 | industry == 3 & race != 1 & !missing(race)

*Show the number of people that are younger than 35 and earn 25 or more:
count if age < 35 & wage >= 25 & !missing(age) & !missing(wage)


***Changing and replacing variables***
generate parttime = .									//Create new variable with missing values
replace parttime = 1 if hours <= 20						//Replace people who work 20 hours or less with value 1
replace parttime = 0 if hours > 20 & !missing(hours)	//Replace people who work more than 20 hours with value 0
tabulate parttime, missing								//Inspect results
tabulate hours parttime, missing						//Crosscheck results
label variable parttime "Working Part-time"				//Labeling variable
label values parttime yesno								//Label values with label "yesno" created above
tabulate parttime, missing

***Assert***
assert missing(hours) == missing(parttime)

*New variables and labels in one step
codebook race						//Check how numerical values are coded to labels
recode race (1 = 1 "yes") (2 3 = 0 "no"), gen(is_white)	//Create and label variable
tabulate race is_white					//Validate results


*Saving our results*
save "nlsw88_new.dta", replace		//Save results and overwrite existing dataset


***Removing Observations and Variables***
*(We wont save these changes to the file)
drop if grade < 4			//Remove people with less than 4 years of schooling
keep if occupation == 2		//Remove all persons that are NOT managers (just keep the managers)

drop parttime			//remove variable which we created before
describe		//test if command was successful

***Cleaning data systematically***
assert inrange(age,18,100)			//Age must be between 18 and 100 years
assert inrange(occupation,1,13)		//Gives error as some cases have missing values
assert inrange(occupation,1,13) | missing(occupation)


compare ttl_exp tenure

*******************************************************************************
*Combining datasets*
*******************************************************************************

/*If you are not using Stata 15 or 14, make sure to add the suffix
_old to any dataset here. So for example, instead of typing
append_a.dta use append_a_old.dta*/



******************************************
*1. Appending Datasets
******************************************
use "http://data.statabook.com/append_a.dta", clear		//Load dataset A
list													//Inspect dataset A
save "append_a", replace								//Save dataset A
use "http://data.statabook.com/append_b.dta", clear		//Load dataset B
list													//Inspect dataset B
save "append_b", replace								//Save dataset B
append using "append_a", generate(check_append)			//Merge and create testing variable
list													//Inspect results


******************************************
*2. One-to-One Merge
******************************************
use "http://data.statabook.com/1to1_a.dta", clear		//Load dataset A
list													//Inspect dataset A
save "1to1_a", replace									//Save dataset A
use "http://data.statabook.com/1to1_b.dta", clear		//Load dataset B
list													//Inspect dataset B
save "1to1_b", replace									//Save dataset B
merge 1:1 country using "1to1_a.dta"					//Merge A and B
list													//Inspect results


******************************************
*3. Many-to-One Merge
******************************************
use "http://data.statabook.com/mto1_b.dta", clear		//Load dataset B
list													//Inspect dataset B
save "mto1_b", replace									//Save dataset B
use "http://data.statabook.com/mto1_a.dta", clear		//Load dataset A
list													//Inspect dataset A
save "mto1_a", replace									//Save dataset A
merge m:1 school_ID using "mto1_b.dta"					//Merge A and B
list													//Inspect results


******************************************
*4. One-to-Many Merge
******************************************
/*This is basically the merge procedure from Many-to-One, yet the master file
(the file that is open in memory at the time of the merge) and the using file
(the file saved on your harddrive) is swapped. You can see this easily by
comparing the command from 3. and 4. Whenever you can perform one of the two merges,
you also can perform the other one. Just pay attention to which file has "many"
cases and which has only "one". If you find this hard to remember, try to think
of the "pupils and schools" example, as this is clear and logical, as there will always
be more pupils than schools. */


use "http://data.statabook.com/mto1_a.dta", clear		//Load dataset A
list													//Inspect dataset A
save "mto1_a", replace									//Save dataset A
use "http://data.statabook.com/mto1_b.dta", clear		//Load dataset B
list													//Inspect dataset B
save "mto1_b", replace									//Save dataset B
merge 1:m school_ID using "mto1_a.dta"					//Merge A and B
list													//Inspect results


******************************************
*5. Many-to-Many Merge
******************************************
/* Do not use this. Ever. To quote the Stata documentation:
"This is allowed for completeness, but it is difficult to imagine
an example of when it would be useful. (...) Use of merge m:m
is not encouraged."
https://www.stata.com/manuals13/dmerge.pdf (2018-01-18).
*/

******************************************
*6. All pairwise combinations (joinby)
******************************************

use "http://data.statabook.com/pairs_a.dta", clear		//Parent data
list
save "pairs_a", replace
use "http://data.statabook.com/pairs_b.dta", clear		//Child data
list
save "pairs_b", replace
joinby family_ID using "pairs_a"						//Form pairs
sort family_ID parent_ID child_ID 						//Sort data
order family_ID parent_ID age_P child_ID score			//Reorder variables
list, sepby(family_ID) 									//List results

*The sebpy option just inserts lines between families which makes
*the list more clear.




******************************************
*Reshaping Data
******************************************

use "http://data.statabook.com/reshape.dta", clear
list
reshape long score@, i(country) j(year) 				//Wide to long
list, sepby(country)									//List results

/* The data is in wide format as every observation takes exactly
one line. Now we reshape, specifying the ID ("id") which is
unique for each case. Then we specify a time variable that
Stata should create for us ("year"). "Long" tells Stata that we 
want to reshape into long format. We also tell Stata which
variables are not time constant. The @-sign tells Stata where
to look for the time information */

*We can go back by using the same command and just change "long" to "wide"

reshape wide score@, i(country) j(year)	//Long to wide
list


*Also shorter commands possible as no new variables are created*
reshape long
reshape wide

*/





/*first week 20230225*/

*******************************************************************************
*General Do-File Header*
clear all			//clear memory
capture log d:/stata_2	//close any log-files (if still running)
*version 15.0		//your current version, change this to your version (and remove asterisk)!
set more off		//do not halt to show all output
*******************************************************************************

***Setting your current working directory***
cd "C:/Users/username/stata-course/"

/*You must replace this generic example with the path to the folder
where your files are saved*/


***Showing current working directory***
pwd


***Show all files in the current folder***
dir


*use "filename.dta"			//Open desired Stata file


***Alternative: absolute folder paths***
*use "C:\Users\username\stata-course\filename.dta"


***Entering data manually***
clear all					//Start with a fresh memory



*useful statasource  FTOOLS: A faster Stata for large datasets https://github.com/sergiocorreia/ftools/raw/master/src/
cap pr drop UninstallAndInstall
program UninstallAndInstall
	args prog
	cap ado uninstall `prog'

	if ("`prog'"=="ftools") {
		loc cmd net install ftools, from("https://github.com/sergiocorreia/ftools/raw/master/src/")
	}
	else if ("`prog'"=="reghdfe") {
		loc cmd net install reghdfe, from("https://github.com/sergiocorreia/reghdfe/raw/master/src/")
	}
	else {
		loc cmd ssc install `prog'
	}
	di as text `"`cmd'"'
	`cmd'

	// Not strictly necessary as it gets compiled later when called by the user
	if inlist("`prog'", "ftools", "reghdfe") {
		`prog', compile
	}
end

adopath

* Merged into ftools
cap ado uninstall moresyntax

loc programs moremata boottest ivreg2 ftools reghdfe estout egenmore binscatter

foreach prog of loc programs {
	UninstallAndInstall `prog'
	di as text "----" _n _n
}

https://github.com/sergiocorreia/ivreghdfe

IVREGHDFE: reghdfe + ivreg2 (adds instrumental variable and additional robust SE estimators to reghdfe)

version 13 // currently needs 13 or 14

* Install requirements

	* ftools
	cap ado uninstall ftools
	net install ftools, from("C:/Git/ftools/src")

	* reghdfe
	cap ado uninstall reghdfe
	net install reghdfe , from("C:/Git/reghdfe/src/")

* Install ivreghdfe
	cap ado uninstall ivreghdfe
	net install ivreghdfe, from("C:/Git/ivreghdfe/src/")
	//net install ivreghdfe, from(c:\git\ivreg2_demo)

* Setup
	clear all
	discard
	pr drop _all
	cls
	sysuse auto
	replace turn = . in 1 // ensure we detect MVs in absorb()

* Sanity checks
	ivreg2 price weight // ensure ivreg2 works
	ivreg2 price weight i.turn , partial(i.turn) small // equivalent to the HDFE version
	ivreghdfe price weight // ensure ivreghdfe works
	ivreghdfe // ensure replay() works

* Test absorb()
	ivreghdfe price weight, absorb(turn) resid
	ivreghdfe // ensure replay() works

	* Benchmark
	reghdfe price weight, a(turn) keepsingletons


* Advanced: TWFE
	ivreghdfe price weight, absorb(turn trunk)
	reghdfe price weight, absorb(turn trunk) keepsing

* Cluster
	ivreghdfe price weight, absorb(turn) cluster(turn)
	reghdfe price weight, absorb(turn) vce(cluster turn) keepsing

* TWC and TWFE
	ivreghdfe price weight, absorb(turn foreign) cluster(turn trunk)
	reghdfe price weight, absorb(turn foreign) cluster(turn trunk) keepsing

exit

clear all
discard
sysuse auto

tempvar touse newtouse
gen `touse' = foreign

loc absorb turn trunk##c.gear, lorem(ipsum) // todo: allow extra options such as tol() here
loc weight_type 
loc weight_var
loc drop_singletons = 0
loc verbose = 1
loc clustervars turn#trunk


// If we load the data by hand, we need
// ms_fvstrip `vars' if `touse', expand dropomit addbn onebyone

mata:
	HDFE = fixed_effects("`absorb'", "`touse'", "`weight_type'", "`weight_var'", `drop_singletons', `verbose')
	HDFE.G // number of absvars
	HDFE.tolerance = 1e-10 // change tolerance (later we can pass these options through absorb)
	HDFE.N // number of obs.
	// We need to know clustervars to compute the absorbed degrees-of-freedom
	HDFE.options.clustervars = tokens("`clustervars'") // todo: rely on ms_parse_vce.ado
	HDFE.options.base_clustervars = tokens(subinstr("`clustervars'", "#", " ")) // todo: rely on ms_parse_vce.ado
	HDFE.options.num_clusters = length(HDFE.options.clustervars)
	
	HDFE.estimate_dof() // compute degrees-of-freedom
	HDFE.output.df_a // e(df_a)
	// Optionally update touse (might have been reduced if we dropped singletons)
	HDFE.save_touse("`newtouse'")
	y = HDFE.partial_out("price") // first option
	x = st_data(HDFE.sample, "weight length") // load data
	HDFE._partial_out(x) // second option
	beta = invsym(cross(x, x)) * cross(x, y)
	beta
	// Cleanup
	mata drop HDFE
end


* Non-Mata alternative:
reghdfe price weight length if foreign, a(turn trunk##c.gear) vce(cluster turn#trunk)


exit

clear all
set more off
timer clear

* Warm-up
sysuse auto
reg price weight
areg price weight, a(turn)
reghdfe price weight, a(turn)
ivreg2 price weight
ivreghdfe price weight, a(turn)

* Benchmark
clear
set obs 10000000
gen double y = runiform()
gen double x1 = runiform()
gen double x2 = runiform()
gen double x3 = runiform()
gen double x4 = runiform()
gen int cl = floor(runiform()*100)
gen int id = floor(runiform()*10000)
gen byte c = 1
cls

* No FEs
timer on 1
reg y x*, cluster(cl)
timer off 1

timer on 2
reghdfe y x*, cluster(cl) noa keepsing
timer off 2

timer on 3
ivreg2 y x*, cluster(cl) small
timer off 3

timer on 4
ivreghdfe y x*, cluster(cl) a(c)
timer off 4

* One set of FEs
timer on 5
areg y x*, cluster(cl) a(id)
timer off 5

timer on 6
reghdfe y x*, cluster(cl) a(id) keepsing
timer off 6

timer on 7
ivreghdfe y x*, cluster(cl) a(id)
timer off 7


timer list
exit

noi cscript "ivreg2 with absorb()" adofile reghdfe

* Setup
	sysuse auto
	replace turn = . in 1 // ensure we detect MVs in absorb()


* Test 1: ivreg2==ivreghdfe

	ivreg2 price weight (gear=length)
	storedresults save benchmark e()

	ivreghdfe price weight (gear=length)
	storedresults compare benchmark e(), exclude(macro: cmd cmdline ivreg2cmd)
	storedresults drop benchmark


* Test 2: ivreghdfe==ivreg+partial
	ivreg2 price weight i.turn , partial(i.turn) small
	**predict double xb1, xb // xb not supported with partial() (and ,resid crashed)
	storedresults save benchmark e()

	ivreghdfe price weight, absorb(turn) keepsingletons
	assert e(N) == 73
	assert e(df_m)==1
	loc excluded ///
		macro: cmd cmdline ivreg2cmd insts inexog partial partial1 partialcons df_m predict ///
		scalar: partialcons df_m
	storedresults compare benchmark e(), exclude(`excluded') tol(1e-12)
	storedresults drop benchmark
	
	**predict double xb2, xb
	**_vassert xb1 xb2, tol(1e-8)
	**drop xb1 xb2


* Test 3: ivreghdfe==reghdfe
	reghdfe price weight, absorb(turn) keepsingletons nocons
	loc bench_r2 = e(r2_within)
	storedresults save benchmark e()

	ivreghdfe price weight, absorb(turn) keepsingletons
	assert e(rank)==.
	assert e(df_m)==1
	assert abs(e(r2) - `bench_r2') < 1e-8
	loc excluded ///
		macro: cmd cmdline vce indepvars title title2 footnote estat_cmd predict marginsnotok ///
		scalar: rank ic N_hdfe_extended redundant tss tss_within mss ll_0 r2_a_within sumweights ///
			r2_a r2_within r2 report_constant converged
	storedresults compare benchmark e(), tol(1e-12) exclude(`excluded') 
	storedresults drop benchmark
	// why does mss differs??


* Test 4: ivreghdfe==reghdfe with TWFE and TWC
* PART 1) Variance matches reghdfe v3
	reghdfe price weight, absorb(turn foreign) cluster(turn trunk) keepsingletons version(3)
	loc bench_r2 = e(r2_within)
	storedresults save benchmark e()

	ivreghdfe price weight, absorb(turn foreign) cluster(turn trunk) keepsingletons
	assert e(rank)==.
	assert e(df_m)==1
	assert abs(e(r2) - `bench_r2') < 1e-8
	loc included scalar: N matrix: b V
	storedresults compare benchmark e(), tol(1e-12) include(`included') 
	storedresults drop benchmark
	// why does mss and rmse differ?

* Test 4: ivreghdfe==reghdfe with TWFE and TWC
* PART 2) All else but variance match reghdfe v5 or v6
	reghdfe price weight, absorb(turn foreign) cluster(turn trunk) keepsingletons nocons
	loc bench_r2 = e(r2_within)
	storedresults save benchmark e()

	ivreghdfe price weight, absorb(turn foreign) cluster(turn trunk) keepsingletons
	assert e(rank)==.
	assert e(df_m)==1
	assert abs(e(r2) - `bench_r2') < 1e-8
	loc excluded ///
		macro: cmd cmdline vce indepvars title title2 footnote estat_cmd predict title3 marginsnotok ///
		scalar: rank ic N_hdfe_extended redundant tss tss_within mss ll_0 r2_a_within sumweights ///
			r2_a r2_within r2 rmse N_clustervars report_constant F converged /// 
		matrix: V
	storedresults compare benchmark e(), tol(1e-12) exclude(`excluded') 
	storedresults drop benchmark
	// why does mss and rmse differ?
	* BUGBUG: maybe tweak the small sample adjustment to match v5 and v6??


* Test 4b: as 4 but drop singletons
	reghdfe price weight, absorb(turn foreign) cluster(turn trunk) nocons
	loc bench_r2 = e(r2_within)
	storedresults save benchmark e()

	ivreghdfe price weight, absorb(turn foreign) cluster(turn trunk)
	assert e(rank)==.
	assert e(df_m)==1
	assert abs(e(r2) - `bench_r2') < 1e-8
	loc excluded ///
		macro: cmd cmdline vce indepvars title title2 footnote estat_cmd predict title3 marginsnotok ///
		scalar: rank ic N_hdfe_extended redundant tss tss_within mss ll_0 r2_a_within sumweights ///
			r2_a r2_within r2 rmse N_clustervars report_constant F converged matrix: V
	storedresults compare benchmark e(), tol(1e-12) exclude(`excluded') 
	storedresults drop benchmark
	// why does mss and rmse differ?



* Test 5: ivreghdfe with IV
	reghdfe price weight (gear=length), absorb(turn) cluster(trunk) keepsingletons version(3)
	loc bench_r2 = e(r2_within)
	storedresults save benchmark e()

	ivreghdfe price weight (gear=length), absorb(turn) cluster(trunk) keepsing
	assert e(rank)==.
	assert e(df_m)==2
	assert abs(e(r2) - `bench_r2') < 1e-8
	* Have to add a bunch more because I' comparing against the old reghdfe
	loc excluded ///
		macro: cmd cmdline vce indepvars title title2 footnote estat_cmd predict title3 ///
			instruments endogvars vcesuite dofadjustments subcmd ivreg2cmd marginsok marginsnotok ///
		scalar: rank ic N_hdfe_extended redundant tss tss_within mss ll_0 r2_a_within sumweights ///
			r2_a r2_within r2 rmse N_clustervars partial_ct df_m savestages r2u r2c G1 M1_nested ///
			M1_exact K1 M1 unclustered_df_r partialcons M_due_to_nested mobility report_constant converged
	storedresults compare benchmark e(), tol(1e-12) exclude(`excluded') 
	storedresults drop benchmark
	// why does mss and rmse differ?

exit

clear all
cls
set more off


// Create fake dataset with two identifiers
set obs 100000
gen long id1 = floor(runiform()*100)
gen long id2 = floor(runiform()*100)
su id*


mata:
mata set matastrict on
timer_clear()
vars = st_data(., .)
// Builtin asociative array
transmorphic scalar counter1(real matrix vars)
{
	real i, n, cols, dict_size, val
	real vector dict
	transmorphic matrix keys
	transmorphic rowvector key
	transmorphic scalar A
	
	cols = cols(vars)
	n = rows(vars)
	dict_size = 100000
	A = asarray_create("real", cols, dict_size)
	asarray_notfound(A, 0)
	// Fill asarray
	for (i=1; i<=n; i++) {
		val = asarray(A, vars[i, .])
		asarray(A, vars[i, .], val + 1)
	}
	return(A)
}
// Alternative asociative array; use linear probing
// (or quadratic probing, or any other open addressing strat)
void counter2(real matrix vars, real matrix dict, real matrix keys)
{
	real i, h, n, cols, dict_size, val
	transmorphic rowvector key
	cols = cols(vars)
	n = rows(vars)
	dict_size = 100000
	dict = J(dict_size, 1, 0)
	keys = J(dict_size, cols, .)
	for (i=1; i<=n; i++) {
		key = vars[i, .]
		h = hash1(key, dict_size)
		val = dict[h]
		
		// key not found before
		if (val == 0) {
			dict[h] = 1
			keys[h, .] = key
		}
		// key was found, no collision
		else if (key == keys[h, .]) {
			dict[h] = dict[h] + 1
		}
		// collision; increment pointer h until empty slot found
		else {
			do {
				++h
				if (h > dict_size) {
					h = 1
				}
				val = dict[h]
				
				if (val == 0) {
					dict[h] = 1
					keys[h, .] = key
					break
				}
				if (key == keys[h, .]) {
					dict[h] = dict[h] + 1
					break
				}
			} while (1)
		}
	}
}
real scalar get1(transmorphic scalar A, transmorphic matrix x)
{
	return(asarray(A, x))
}
real scalar get2(real vector dict, transmorphic matrix keys, transmorphic matrix x)
{
	real h, val, dict_size
	dict_size = 100000
	h = hash1(x, dict_size)
	do {
		val = dict[h]
		if (val==0) {
			return(0)
			break
		}
		else if (keys[h, .] == x) {
			return(val)
			break
		}
		else {
			++h
		}
	} while (1)
}
// Builtin
timer_on(1)
c1 = counter1(vars)
timer_off(1)
// Alternative
timer_on(2)
dict = keys = .
c2 = counter2(vars, dict, keys)
timer_off(2)
// Check that they give the same values
x = (1,1)
get1(c1, x)
get2(dict, keys, x)
timer()
// Done
end


发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注