students.csv into SAS and create a dataset named students_data. Display the first 10 observations.
▶
/* Step 1: Import CSV file into SAS dataset */ PROC IMPORT DATAFILE="students.csv" OUT=students_data DBMS=CSV REPLACE; GETNAMES=YES; RUN; /* Step 2: Display first 10 observations */ PROC PRINT DATA=students_data (OBS=10); RUN;
PROC IMPORT — Reads the CSV file and creates dataset students_dataDBMS=CSV — Specifies the file type as CSVGETNAMES=YES — Uses the first row as variable namesPROC PRINT (OBS=10) — Displays only the first 10 rowsemployee_data using DATALINES with variables Emp_ID, Name, Department, Salary. Enter at least 5 records and display the dataset.
▶
/* Step 1: Create dataset using DATALINES */ DATA employee_data; INPUT Emp_ID Name $ Department $ Salary; DATALINES; 101 Amit HR 30000 102 Neha IT 45000 103 Raj Sales 35000 104 Priya Finance 40000 105 Karan IT 50000 ; RUN; /* Step 2: Display the dataset */ PROC PRINT DATA=employee_data; RUN;
DATA employee_data; — Creates a new datasetINPUT — Defines variables ($ for character variables like Name & Department)DATALINES — Used to manually enter data inlinePROC PRINT — Displays the datasetsales_data using appropriate SAS procedure.
▶
/* Display structure and metadata of dataset */ PROC CONTENTS DATA=sales_data; RUN;
PROC CONTENTS — Shows full structure of the datasetPROC CONTENTS DATA=sales_data SHORT;student_marks, display only students whose Marks are greater than 60.
▶
/* Display students with Marks > 60 */ PROC PRINT DATA=student_marks; WHERE Marks > 60; RUN;
WHERE Marks > 60; — Filters dataset to show only students scoring above 60PROC PRINT — Displays the filtered recordsDATA filtered_students; SET student_marks; IF Marks > 60; RUN; PROC PRINT DATA=filtered_students; RUN;
customer_data by Customer_ID and display the sorted dataset.
▶
/* Step 1: Sort dataset */ PROC SORT DATA=customer_data OUT=sorted_customer_data; BY Customer_ID; RUN; /* Step 2: Display sorted dataset */ PROC PRINT DATA=sorted_customer_data; RUN;
PROC SORT — Sorts the datasetBY Customer_ID; — Sorts in ascending order by Customer_IDOUT=sorted_customer_data — Stores result in a new datasetBY DESCENDING Customer_ID;customer_data based on Customer_ID.
▶
/* Remove duplicate records */ PROC SORT DATA=customer_data OUT=unique_customer_data NODUPKEY; BY Customer_ID; RUN; PROC PRINT DATA=unique_customer_data; RUN;
NODUPKEY — Removes duplicate records based on the BY variableBY Customer_ID; — Keeps only the first occurrence of each Customer_IDOUT=unique_customer_data — Stores the deduplicated resultstudent_info and student_marks using Student_ID and display the merged dataset.
▶
/* Step 1: Sort both datasets by Student_ID */ PROC SORT DATA=student_info; BY Student_ID; RUN; PROC SORT DATA=student_marks; BY Student_ID; RUN; /* Step 2: Merge datasets */ DATA merged_data; MERGE student_info student_marks; BY Student_ID; RUN; /* Step 3: Display merged dataset */ PROC PRINT DATA=merged_data; RUN;
MERGE — Combines datasets horizontallyBY Student_ID; — Matches records based on Student_IDclassA and classB into a single dataset using SET.
▶
/* Combine datasets (vertical stack) */ DATA combined_class; SET classA classB; RUN; PROC PRINT DATA=combined_class; RUN;
SET classA classB; — Appends (stacks) classB below classA verticallyDATA combined_class; — Creates a new combined datasetemployee_data, keep only variables Emp_ID, Name, Salary and display the dataset.
▶
/* Keep only selected variables */ DATA filtered_employee; SET employee_data; KEEP Emp_ID Name Salary; RUN; PROC PRINT DATA=filtered_employee; RUN;
KEEP Emp_ID Name Salary; — Retains only these columnsSalary to Monthly_Salary in dataset employee_data.
▶
/* Rename variable */ DATA employee_data_renamed; SET employee_data; RENAME Salary = Monthly_Salary; RUN; PROC PRINT DATA=employee_data_renamed; RUN;
RENAME Salary = Monthly_Salary; — Changes the variable nameMarks from dataset exam_data and replace them with 0.
▶
/* Replace missing values with 0 */ DATA updated_exam_data; SET exam_data; IF Marks = . THEN Marks = 0; RUN; PROC PRINT DATA=updated_exam_data; RUN;
Marks = . — Represents missing numeric values in SAS (dot notation)IF Marks = . THEN Marks = 0; — Replaces missing values with 0customer_details containing variable Full_Name, extract the first name using SAS functions.
▶
DATA customer_firstname; SET customer_details; First_Name = SCAN(Full_Name, 1); RUN; PROC PRINT DATA=customer_firstname; RUN;
SCAN(Full_Name, 1) — Extracts the first word (first name) from the Full_Name string/* Extract first three characters */ DATA new_data; SET your_dataset; First_Three = SUBSTR(text_variable, 1, 3); RUN; PROC PRINT DATA=new_data; RUN;
SUBSTR(text_variable, 1, 3) — Extracts 3 characters starting from position 1First_Three — New variable storing the extracted result/* Remove spaces from character variable */ DATA new_data; SET your_dataset; Clean_Text = COMPRESS(text_variable, ' '); RUN; PROC PRINT DATA=new_data; RUN;
COMPRESS(text_variable, ' ') — Removes all spaces from the variableClean_Text — New variable without spacesmonthly_sales from long format to wide format using PROC TRANSPOSE.
▶
/* Sort first */ PROC SORT DATA=monthly_sales; BY ID; RUN; /* Convert long format to wide format */ PROC TRANSPOSE DATA=monthly_sales OUT=wide_sales; BY ID; ID Month; VAR Sales; RUN; PROC PRINT DATA=wide_sales; RUN;
BY ID; — Groups data by identifier (Customer/Product)ID Month; — Creates new columns based on month valuesVAR Sales; — Values to populate the new columnsOUT=wide_sales — Output dataset in wide formatemployee_salary, calculate the mean, minimum, and maximum of Salary.
▶
PROC MEANS DATA=employee_salary MEAN MIN MAX; VAR Salary; RUN;
PROC MEANS — Calculates descriptive statisticsMEAN MIN MAX — Specifies which statistics to displayVAR Salary; — Specifies the variable to analyzeDepartment variable using PROC FREQ.
▶
PROC FREQ DATA=employee_salary; TABLES Department; RUN;
PROC FREQ — Produces frequency tablesTABLES Department; — Shows count and percentage for each unique departmentstudent_marks, generate frequency distribution for Subject.
▶
PROC FREQ DATA=student_marks; TABLES Subject; RUN;
sales_data, calculate mean and sum of Quantity and Price.
▶
PROC MEANS DATA=sales_data MEAN SUM; VAR Quantity Price; RUN;
employee_salary, calculate average salary for each department.
▶
PROC MEANS DATA=employee_salary MEAN; CLASS Department; VAR Salary; RUN;
CLASS Department; — Groups results by departmentstudent_data, generate frequency distribution for Course variable.
▶
PROC FREQ DATA=student_data; TABLES Course; RUN;
student_scores, perform a t-test to compare marks of male and female students.
▶
PROC TTEST DATA=student_scores; CLASS Gender; VAR Marks; RUN;
PROC TTEST — Performs an independent samples t-testCLASS Gender; — Defines the two groups to compareVAR Marks; — The continuous variable being comparedexam_result, perform Chi-square test between Gender and Pass/Fail.
▶
PROC FREQ DATA=exam_result; TABLES Gender*Result / CHISQ; RUN;
Gender*Result — Creates a cross-tabulation (contingency table)/ CHISQ — Requests the Chi-square test statistic and p-valueTotal_Sales = Quantity × Price in dataset sales_data.
▶
DATA sales_updated; SET sales_data; Total_Sales = Quantity * Price; RUN; PROC PRINT DATA=sales_updated; RUN;
Discount_Price = Price × 0.9 and display the dataset.
▶
DATA sales_updated; SET sales_data; Discount_Price = Price * 0.9; RUN; PROC PRINT DATA=sales_updated; RUN;
employee_salary, calculate mean and standard deviation of salary.
▶
PROC MEANS DATA=employee_salary MEAN STD; VAR Salary; RUN;
advertising_data — Sales as dependent, TV as independent variable.
▶
PROC REG DATA=advertising_data; MODEL Sales = TV; RUN; QUIT;
PROC REG — Performs OLS regressionMODEL Sales = TV; — Sales is the dependent variable, TV is the predictorQUIT; — Required to exit interactive PROC REGPROC REG DATA=advertising_data; MODEL Sales = TV Radio; RUN; QUIT;
= performs multiple linear regressionODS PDF FILE="output.pdf"; PROC PRINT DATA=employee_salary; RUN; ODS PDF CLOSE;
ODS PDF FILE= — Opens a PDF output destinationODS PDF and ODS PDF CLOSE are written to the fileODS PDF CLOSE; — Closes and saves the PDFPROC MEANS DATA=employee_salary MEAN; CLASS Department; VAR Salary; RUN;
CLASS Department; — Groups results by each department